Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |   52
-rw-r--r--  mm/Makefile          |    4
-rw-r--r--  mm/ashmem.c          |  748
-rw-r--r--  mm/backing-dev.c     |  128
-rw-r--r--  mm/bootmem.c         |    5
-rw-r--r--  mm/cma-best-fit.c    |  408
-rw-r--r--  mm/cma.c             | 1413
-rw-r--r--  mm/compaction.c      |   55
-rw-r--r--  mm/filemap.c         |   44
-rw-r--r--  mm/filemap_xip.c     |    7
-rw-r--r--  mm/huge_memory.c     |   47
-rw-r--r--  mm/hugetlb.c         |   75
-rw-r--r--  mm/internal.h        |   46
-rw-r--r--  mm/ksm.c             |    6
-rw-r--r--  mm/madvise.c         |   16
-rw-r--r--  mm/memcontrol.c      |   67
-rw-r--r--  mm/memory-failure.c  |    6
-rw-r--r--  mm/memory.c          |   76
-rw-r--r--  mm/memory_hotplug.c  |    2
-rw-r--r--  mm/mempolicy.c       |   73
-rw-r--r--  mm/migrate.c         |  248
-rw-r--r--  mm/mincore.c         |    2
-rw-r--r--  mm/mmu_notifier.c    |   45
-rw-r--r--  mm/nobootmem.c       |    3
-rw-r--r--  mm/nommu.c           |    9
-rw-r--r--  mm/oom_kill.c        |    6
-rw-r--r--  mm/page-writeback.c  |  894
-rw-r--r--  mm/page_alloc.c      |  222
-rw-r--r--  mm/pagewalk.c        |    2
-rw-r--r--  mm/percpu-vm.c       |   12
-rw-r--r--  mm/percpu.c          |   50
-rw-r--r--  mm/shmem.c           |  516
-rw-r--r--  mm/slab.c            |   13
-rw-r--r--  mm/slqb.c            | 3816
-rw-r--r--  mm/slub.c            |   51
-rw-r--r--  mm/sparse.c          |   30
-rw-r--r--  mm/swap.c            |   85
-rw-r--r--  mm/swap_state.c      |    2
-rw-r--r--  mm/swapfile.c        |    4
-rw-r--r--  mm/vmalloc.c         |   93
-rw-r--r--  mm/vmscan.c          |  422
-rw-r--r--  mm/vmstat.c          |    3
42 files changed, 8873 insertions, 933 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 8ca47a5ee9c..3c2b6739c87 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -370,3 +370,55 @@ config CLEANCACHE
370 in a negligible performance hit. 370 in a negligible performance hit.
371 371
372 If unsure, say Y to enable cleancache 372 If unsure, say Y to enable cleancache
373
374config CMA
375 bool "Contiguous Memory Allocator framework"
376 # Currently there is only one allocator so force it on
377 select CMA_BEST_FIT
378 help
379 This enables the Contiguous Memory Allocator framework which
380 allows drivers to allocate big physically-contiguous blocks of
381 memory for use with hardware components that do not support
382 I/O mapping or scatter-gather.
383
384 If you select this option you will also have to select at least
385 one allocator algorithm below.
386
387 To make use of CMA you need to specify the regions and
388 driver->region mapping on command line when booting the kernel.
389
390config CMA_DEVELOPEMENT
391 bool "Include CMA development features"
392 depends on CMA
393 help
394 This lets you enable some development features of the CMA
395 framework.
396
397config CMA_DEBUG
398 bool "CMA debug messages"
399 depends on CMA_DEVELOPEMENT
400 help
401 Enable debug messages in CMA code.
402
403config CMA_SYSFS
404 bool "CMA SysFS interface support"
405 depends on CMA_DEVELOPEMENT
406 help
407 Enable support for SysFS interface.
408
409config CMA_CMDLINE
410 bool "CMA command line parameters support"
411 depends on CMA_DEVELOPEMENT
412 help
413 Enable support for cma, cma.map and cma.asterisk command line
414 parameters.
415
416config CMA_BEST_FIT
417 bool "CMA best-fit allocator"
418 depends on CMA
419 help
420 This is a best-fit algorithm running in O(n log n) time where
421 n is the number of existing holes (which is never greater than
422 the number of allocated regions and usually much smaller). It
423 allocates an area from the smallest hole that is big enough for
424 the allocation in question.
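As an illustration of the boot-time configuration the help texts above refer to (hypothetical region and device names, following the cma= and cma.map= grammar defined in mm/cma.c later in this patch):

    cma=reg1=64M:bf;reg2=32M@0x40000000:bf
    cma.map=video=reg1;camera=reg2

The first parameter reserves two regions at boot, a 64 MiB region placed anywhere and a 32 MiB region at a fixed address, both handed to the "bf" best-fit allocator; the second maps the devices named "video" and "camera" onto those regions.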
diff --git a/mm/Makefile b/mm/Makefile
index 836e4163c1b..f846ad087a1 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
30obj-$(CONFIG_NUMA) += mempolicy.o 30obj-$(CONFIG_NUMA) += mempolicy.o
31obj-$(CONFIG_SPARSEMEM) += sparse.o 31obj-$(CONFIG_SPARSEMEM) += sparse.o
32obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o 32obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
33obj-$(CONFIG_ASHMEM) += ashmem.o
33obj-$(CONFIG_SLOB) += slob.o 34obj-$(CONFIG_SLOB) += slob.o
34obj-$(CONFIG_COMPACTION) += compaction.o 35obj-$(CONFIG_COMPACTION) += compaction.o
35obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 36obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
@@ -37,6 +38,7 @@ obj-$(CONFIG_KSM) += ksm.o
37obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o 38obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
38obj-$(CONFIG_SLAB) += slab.o 39obj-$(CONFIG_SLAB) += slab.o
39obj-$(CONFIG_SLUB) += slub.o 40obj-$(CONFIG_SLUB) += slub.o
41obj-$(CONFIG_SLQB) += slqb.o
40obj-$(CONFIG_KMEMCHECK) += kmemcheck.o 42obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
41obj-$(CONFIG_FAILSLAB) += failslab.o 43obj-$(CONFIG_FAILSLAB) += failslab.o
42obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 44obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
@@ -50,3 +52,5 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
50obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 52obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
51obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 53obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
52obj-$(CONFIG_CLEANCACHE) += cleancache.o 54obj-$(CONFIG_CLEANCACHE) += cleancache.o
55obj-$(CONFIG_CMA) += cma.o
56obj-$(CONFIG_CMA_BEST_FIT) += cma-best-fit.o
diff --git a/mm/ashmem.c b/mm/ashmem.c
new file mode 100644
index 00000000000..66e3f23ee33
--- /dev/null
+++ b/mm/ashmem.c
@@ -0,0 +1,748 @@
1/* mm/ashmem.c
2**
3** Anonymous Shared Memory Subsystem, ashmem
4**
5** Copyright (C) 2008 Google, Inc.
6**
7** Robert Love <rlove@google.com>
8**
9** This software is licensed under the terms of the GNU General Public
10** License version 2, as published by the Free Software Foundation, and
11** may be copied, distributed, and modified under those terms.
12**
13** This program is distributed in the hope that it will be useful,
14** but WITHOUT ANY WARRANTY; without even the implied warranty of
15** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16** GNU General Public License for more details.
17*/
18
19#include <linux/module.h>
20#include <linux/file.h>
21#include <linux/fs.h>
22#include <linux/miscdevice.h>
23#include <linux/security.h>
24#include <linux/mm.h>
25#include <linux/mman.h>
26#include <linux/uaccess.h>
27#include <linux/personality.h>
28#include <linux/bitops.h>
29#include <linux/mutex.h>
30#include <linux/shmem_fs.h>
31#include <linux/ashmem.h>
32
33#define ASHMEM_NAME_PREFIX "dev/ashmem/"
34#define ASHMEM_NAME_PREFIX_LEN (sizeof(ASHMEM_NAME_PREFIX) - 1)
35#define ASHMEM_FULL_NAME_LEN (ASHMEM_NAME_LEN + ASHMEM_NAME_PREFIX_LEN)
36
37/*
38 * ashmem_area - anonymous shared memory area
39 * Lifecycle: From our parent file's open() until its release()
40 * Locking: Protected by `ashmem_mutex'
41 * Big Note: Mappings do NOT pin this structure; it dies on close()
42 */
43struct ashmem_area {
44 char name[ASHMEM_FULL_NAME_LEN];/* optional name for /proc/pid/maps */
45 struct list_head unpinned_list; /* list of all ashmem areas */
46 struct file *file; /* the shmem-based backing file */
47 size_t size; /* size of the mapping, in bytes */
48 unsigned long prot_mask; /* allowed prot bits, as vm_flags */
49};
50
51/*
52 * ashmem_range - represents an interval of unpinned (evictable) pages
53 * Lifecycle: From unpin to pin
54 * Locking: Protected by `ashmem_mutex'
55 */
56struct ashmem_range {
57 struct list_head lru; /* entry in LRU list */
58 struct list_head unpinned; /* entry in its area's unpinned list */
59 struct ashmem_area *asma; /* associated area */
60 size_t pgstart; /* starting page, inclusive */
61 size_t pgend; /* ending page, inclusive */
62 unsigned int purged; /* ASHMEM_NOT_PURGED or ASHMEM_WAS_PURGED */
63};
64
65/* LRU list of unpinned pages, protected by ashmem_mutex */
66static LIST_HEAD(ashmem_lru_list);
67
68/* Count of pages on our LRU list, protected by ashmem_mutex */
69static unsigned long lru_count;
70
71/*
72 * ashmem_mutex - protects the list of and each individual ashmem_area
73 *
74 * Lock Ordering: ashmem_mutex -> i_mutex -> i_alloc_sem
75 */
76static DEFINE_MUTEX(ashmem_mutex);
77
78static struct kmem_cache *ashmem_area_cachep __read_mostly;
79static struct kmem_cache *ashmem_range_cachep __read_mostly;
80
81#define range_size(range) \
82 ((range)->pgend - (range)->pgstart + 1)
83
84#define range_on_lru(range) \
85 ((range)->purged == ASHMEM_NOT_PURGED)
86
87#define page_range_subsumes_range(range, start, end) \
88 (((range)->pgstart >= (start)) && ((range)->pgend <= (end)))
89
90#define page_range_subsumed_by_range(range, start, end) \
91 (((range)->pgstart <= (start)) && ((range)->pgend >= (end)))
92
93#define page_in_range(range, page) \
94 (((range)->pgstart <= (page)) && ((range)->pgend >= (page)))
95
96#define page_range_in_range(range, start, end) \
97 (page_in_range(range, start) || page_in_range(range, end) || \
98 page_range_subsumes_range(range, start, end))
99
100#define range_before_page(range, page) \
101 ((range)->pgend < (page))
102
103#define PROT_MASK (PROT_EXEC | PROT_READ | PROT_WRITE)
104
105static inline void lru_add(struct ashmem_range *range)
106{
107 list_add_tail(&range->lru, &ashmem_lru_list);
108 lru_count += range_size(range);
109}
110
111static inline void lru_del(struct ashmem_range *range)
112{
113 list_del(&range->lru);
114 lru_count -= range_size(range);
115}
116
117/*
118 * range_alloc - allocate and initialize a new ashmem_range structure
119 *
120 * 'asma' - associated ashmem_area
121 * 'prev_range' - the previous ashmem_range in the sorted asma->unpinned list
122 * 'purged' - initial purge value (ASHMEM_NOT_PURGED or ASHMEM_WAS_PURGED)
123 * 'start' - starting page, inclusive
124 * 'end' - ending page, inclusive
125 *
126 * Caller must hold ashmem_mutex.
127 */
128static int range_alloc(struct ashmem_area *asma,
129 struct ashmem_range *prev_range, unsigned int purged,
130 size_t start, size_t end)
131{
132 struct ashmem_range *range;
133
134 range = kmem_cache_zalloc(ashmem_range_cachep, GFP_KERNEL);
135 if (unlikely(!range))
136 return -ENOMEM;
137
138 range->asma = asma;
139 range->pgstart = start;
140 range->pgend = end;
141 range->purged = purged;
142
143 list_add_tail(&range->unpinned, &prev_range->unpinned);
144
145 if (range_on_lru(range))
146 lru_add(range);
147
148 return 0;
149}
150
151static void range_del(struct ashmem_range *range)
152{
153 list_del(&range->unpinned);
154 if (range_on_lru(range))
155 lru_del(range);
156 kmem_cache_free(ashmem_range_cachep, range);
157}
158
159/*
160 * range_shrink - shrinks a range
161 *
162 * Caller must hold ashmem_mutex.
163 */
164static inline void range_shrink(struct ashmem_range *range,
165 size_t start, size_t end)
166{
167 size_t pre = range_size(range);
168
169 range->pgstart = start;
170 range->pgend = end;
171
172 if (range_on_lru(range))
173 lru_count -= pre - range_size(range);
174}
175
176static int ashmem_open(struct inode *inode, struct file *file)
177{
178 struct ashmem_area *asma;
179 int ret;
180
181 ret = generic_file_open(inode, file);
182 if (unlikely(ret))
183 return ret;
184
185 asma = kmem_cache_zalloc(ashmem_area_cachep, GFP_KERNEL);
186 if (unlikely(!asma))
187 return -ENOMEM;
188
189 INIT_LIST_HEAD(&asma->unpinned_list);
190 memcpy(asma->name, ASHMEM_NAME_PREFIX, ASHMEM_NAME_PREFIX_LEN);
191 asma->prot_mask = PROT_MASK;
192 file->private_data = asma;
193
194 return 0;
195}
196
197static int ashmem_release(struct inode *ignored, struct file *file)
198{
199 struct ashmem_area *asma = file->private_data;
200 struct ashmem_range *range, *next;
201
202 mutex_lock(&ashmem_mutex);
203 list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned)
204 range_del(range);
205 mutex_unlock(&ashmem_mutex);
206
207 if (asma->file)
208 fput(asma->file);
209 kmem_cache_free(ashmem_area_cachep, asma);
210
211 return 0;
212}
213
214static ssize_t ashmem_read(struct file *file, char __user *buf,
215 size_t len, loff_t *pos)
216{
217 struct ashmem_area *asma = file->private_data;
218 int ret = 0;
219
220 mutex_lock(&ashmem_mutex);
221
222 /* If size is not set, or set to 0, always return EOF. */
223 if (asma->size == 0) {
224 goto out;
225 }
226
227 if (!asma->file) {
228 ret = -EBADF;
229 goto out;
230 }
231
232 ret = asma->file->f_op->read(asma->file, buf, len, pos);
233 if (ret < 0) {
234 goto out;
235 }
236
237 /** Update backing file pos, since f_ops->read() doesn't */
238 asma->file->f_pos = *pos;
239
240out:
241 mutex_unlock(&ashmem_mutex);
242 return ret;
243}
244
245static loff_t ashmem_llseek(struct file *file, loff_t offset, int origin)
246{
247 struct ashmem_area *asma = file->private_data;
248 int ret;
249
250 mutex_lock(&ashmem_mutex);
251
252 if (asma->size == 0) {
253 ret = -EINVAL;
254 goto out;
255 }
256
257 if (!asma->file) {
258 ret = -EBADF;
259 goto out;
260 }
261
262 ret = asma->file->f_op->llseek(asma->file, offset, origin);
263 if (ret < 0) {
264 goto out;
265 }
266
267 /** Copy f_pos from backing file, since f_ops->llseek() sets it */
268 file->f_pos = asma->file->f_pos;
269
270out:
271 mutex_unlock(&ashmem_mutex);
272 return ret;
273}
274
275static inline unsigned long
276calc_vm_may_flags(unsigned long prot)
277{
278 return _calc_vm_trans(prot, PROT_READ, VM_MAYREAD ) |
279 _calc_vm_trans(prot, PROT_WRITE, VM_MAYWRITE) |
280 _calc_vm_trans(prot, PROT_EXEC, VM_MAYEXEC);
281}
282
283static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
284{
285 struct ashmem_area *asma = file->private_data;
286 int ret = 0;
287
288 mutex_lock(&ashmem_mutex);
289
290 /* user needs to SET_SIZE before mapping */
291 if (unlikely(!asma->size)) {
292 ret = -EINVAL;
293 goto out;
294 }
295
296 /* requested protection bits must match our allowed protection mask */
297 if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) &
298 calc_vm_prot_bits(PROT_MASK))) {
299 ret = -EPERM;
300 goto out;
301 }
302 vma->vm_flags &= ~calc_vm_may_flags(~asma->prot_mask);
303
304 if (!asma->file) {
305 char *name = ASHMEM_NAME_DEF;
306 struct file *vmfile;
307
308 if (asma->name[ASHMEM_NAME_PREFIX_LEN] != '\0')
309 name = asma->name;
310
311 /* ... and allocate the backing shmem file */
312 vmfile = shmem_file_setup(name, asma->size, vma->vm_flags);
313 if (unlikely(IS_ERR(vmfile))) {
314 ret = PTR_ERR(vmfile);
315 goto out;
316 }
317 asma->file = vmfile;
318 }
319 get_file(asma->file);
320
321 if (vma->vm_flags & VM_SHARED)
322 shmem_set_file(vma, asma->file);
323 else {
324 if (vma->vm_file)
325 fput(vma->vm_file);
326 vma->vm_file = asma->file;
327 }
328 vma->vm_flags |= VM_CAN_NONLINEAR;
329
330out:
331 mutex_unlock(&ashmem_mutex);
332 return ret;
333}
334
335/*
336 * ashmem_shrink - our cache shrinker, called from mm/vmscan.c :: shrink_slab
337 *
338 * 'nr_to_scan' is the number of objects (pages) to prune, or 0 to query how
339 * many objects (pages) we have in total.
340 *
341 * 'gfp_mask' is the mask of the allocation that got us into this mess.
342 *
343 * Return value is the number of objects (pages) remaining, or -1 if we cannot
344 * proceed without risk of deadlock (due to gfp_mask).
345 *
346 * We approximate LRU via least-recently-unpinned, jettisoning unpinned partial
347 * chunks of ashmem regions LRU-wise one-at-a-time until we hit 'nr_to_scan'
348 * pages freed.
349 */
350static int ashmem_shrink(struct shrinker *s, struct shrink_control *sc)
351{
352 struct ashmem_range *range, *next;
353
354 /* We might recurse into filesystem code, so bail out if necessary */
355 if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
356 return -1;
357 if (!sc->nr_to_scan)
358 return lru_count;
359
360 mutex_lock(&ashmem_mutex);
361 list_for_each_entry_safe(range, next, &ashmem_lru_list, lru) {
362 struct inode *inode = range->asma->file->f_dentry->d_inode;
363 loff_t start = range->pgstart * PAGE_SIZE;
364 loff_t end = (range->pgend + 1) * PAGE_SIZE - 1;
365
366 vmtruncate_range(inode, start, end);
367 range->purged = ASHMEM_WAS_PURGED;
368 lru_del(range);
369
370 sc->nr_to_scan -= range_size(range);
371 if (sc->nr_to_scan <= 0)
372 break;
373 }
374 mutex_unlock(&ashmem_mutex);
375
376 return lru_count;
377}
378
379static struct shrinker ashmem_shrinker = {
380 .shrink = ashmem_shrink,
381 .seeks = DEFAULT_SEEKS * 4,
382};
383
384static int set_prot_mask(struct ashmem_area *asma, unsigned long prot)
385{
386 int ret = 0;
387
388 mutex_lock(&ashmem_mutex);
389
390 /* the user can only remove, not add, protection bits */
391 if (unlikely((asma->prot_mask & prot) != prot)) {
392 ret = -EINVAL;
393 goto out;
394 }
395
396 /* does the application expect PROT_READ to imply PROT_EXEC? */
397 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
398 prot |= PROT_EXEC;
399
400 asma->prot_mask = prot;
401
402out:
403 mutex_unlock(&ashmem_mutex);
404 return ret;
405}
406
407static int set_name(struct ashmem_area *asma, void __user *name)
408{
409 int ret = 0;
410
411 mutex_lock(&ashmem_mutex);
412
413 /* cannot change an existing mapping's name */
414 if (unlikely(asma->file)) {
415 ret = -EINVAL;
416 goto out;
417 }
418
419 if (unlikely(copy_from_user(asma->name + ASHMEM_NAME_PREFIX_LEN,
420 name, ASHMEM_NAME_LEN)))
421 ret = -EFAULT;
422 asma->name[ASHMEM_FULL_NAME_LEN-1] = '\0';
423
424out:
425 mutex_unlock(&ashmem_mutex);
426
427 return ret;
428}
429
430static int get_name(struct ashmem_area *asma, void __user *name)
431{
432 int ret = 0;
433
434 mutex_lock(&ashmem_mutex);
435 if (asma->name[ASHMEM_NAME_PREFIX_LEN] != '\0') {
436 size_t len;
437
438 /*
439 * Copying only `len', instead of ASHMEM_NAME_LEN, bytes
440 * prevents us from revealing one user's stack to another.
441 */
442 len = strlen(asma->name + ASHMEM_NAME_PREFIX_LEN) + 1;
443 if (unlikely(copy_to_user(name,
444 asma->name + ASHMEM_NAME_PREFIX_LEN, len)))
445 ret = -EFAULT;
446 } else {
447 if (unlikely(copy_to_user(name, ASHMEM_NAME_DEF,
448 sizeof(ASHMEM_NAME_DEF))))
449 ret = -EFAULT;
450 }
451 mutex_unlock(&ashmem_mutex);
452
453 return ret;
454}
455
456/*
457 * ashmem_pin - pin the given ashmem region, returning whether it was
458 * previously purged (ASHMEM_WAS_PURGED) or not (ASHMEM_NOT_PURGED).
459 *
460 * Caller must hold ashmem_mutex.
461 */
462static int ashmem_pin(struct ashmem_area *asma, size_t pgstart, size_t pgend)
463{
464 struct ashmem_range *range, *next;
465 int ret = ASHMEM_NOT_PURGED;
466
467 list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned) {
468 /* moved past last applicable page; we can short circuit */
469 if (range_before_page(range, pgstart))
470 break;
471
472 /*
473 * The user can ask us to pin pages that span multiple ranges,
474 * or to pin pages that aren't even unpinned, so this is messy.
475 *
476 * Four cases:
477 * 1. The requested range subsumes an existing range, so we
478 * just remove the entire matching range.
479 * 2. The requested range overlaps the start of an existing
480 * range, so we just update that range.
481 * 3. The requested range overlaps the end of an existing
482 * range, so we just update that range.
483 * 4. The requested range punches a hole in an existing range,
484 * so we have to update one side of the range and then
485 * create a new range for the other side.
486 */
487 if (page_range_in_range(range, pgstart, pgend)) {
488 ret |= range->purged;
489
490 /* Case #1: Easy. Just nuke the whole thing. */
491 if (page_range_subsumes_range(range, pgstart, pgend)) {
492 range_del(range);
493 continue;
494 }
495
496 /* Case #2: We overlap from the start, so adjust it */
497 if (range->pgstart >= pgstart) {
498 range_shrink(range, pgend + 1, range->pgend);
499 continue;
500 }
501
502 /* Case #3: We overlap from the rear, so adjust it */
503 if (range->pgend <= pgend) {
504 range_shrink(range, range->pgstart, pgstart-1);
505 continue;
506 }
507
508 /*
509 * Case #4: We eat a chunk out of the middle. A bit
510 * more complicated, we allocate a new range for the
511 * second half and adjust the first chunk's endpoint.
512 */
513 range_alloc(asma, range, range->purged,
514 pgend + 1, range->pgend);
515 range_shrink(range, range->pgstart, pgstart - 1);
516 break;
517 }
518 }
519
520 return ret;
521}
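A worked example of case #4 above (illustration, not part of the patch): pinning pages 3-4 of a region whose pages 0-9 are currently unpinned shrinks the existing 0-9 range to 0-2 and allocates a new unpinned range 5-9 for the remainder.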
522
523/*
524 * ashmem_unpin - unpin the given range of pages. Returns zero on success.
525 *
526 * Caller must hold ashmem_mutex.
527 */
528static int ashmem_unpin(struct ashmem_area *asma, size_t pgstart, size_t pgend)
529{
530 struct ashmem_range *range, *next;
531 unsigned int purged = ASHMEM_NOT_PURGED;
532
533restart:
534 list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned) {
535 /* short circuit: this is our insertion point */
536 if (range_before_page(range, pgstart))
537 break;
538
539 /*
540 * The user can ask us to unpin pages that are already entirely
541 * or partially pinned. We handle those two cases here.
542 */
543 if (page_range_subsumed_by_range(range, pgstart, pgend))
544 return 0;
545 if (page_range_in_range(range, pgstart, pgend)) {
546 pgstart = min_t(size_t, range->pgstart, pgstart),
547 pgend = max_t(size_t, range->pgend, pgend);
548 purged |= range->purged;
549 range_del(range);
550 goto restart;
551 }
552 }
553
554 return range_alloc(asma, range, purged, pgstart, pgend);
555}
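Conversely (illustration, not part of the patch): unpinning pages 5-10 while pages 7-8 are already unpinned takes the overlap branch above, widens the request to the union of the two intervals, deletes the old 7-8 range, and finally inserts a single unpinned range covering 5-10.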
556
557/*
558 * ashmem_get_pin_status - Returns ASHMEM_IS_UNPINNED if _any_ pages in the
559 * given interval are unpinned and ASHMEM_IS_PINNED otherwise.
560 *
561 * Caller must hold ashmem_mutex.
562 */
563static int ashmem_get_pin_status(struct ashmem_area *asma, size_t pgstart,
564 size_t pgend)
565{
566 struct ashmem_range *range;
567 int ret = ASHMEM_IS_PINNED;
568
569 list_for_each_entry(range, &asma->unpinned_list, unpinned) {
570 if (range_before_page(range, pgstart))
571 break;
572 if (page_range_in_range(range, pgstart, pgend)) {
573 ret = ASHMEM_IS_UNPINNED;
574 break;
575 }
576 }
577
578 return ret;
579}
580
581static int ashmem_pin_unpin(struct ashmem_area *asma, unsigned long cmd,
582 void __user *p)
583{
584 struct ashmem_pin pin;
585 size_t pgstart, pgend;
586 int ret = -EINVAL;
587
588 if (unlikely(!asma->file))
589 return -EINVAL;
590
591 if (unlikely(copy_from_user(&pin, p, sizeof(pin))))
592 return -EFAULT;
593
594 /* By convention, you can pass zero for len to mean "everything onward" */
595 if (!pin.len)
596 pin.len = PAGE_ALIGN(asma->size) - pin.offset;
597
598 if (unlikely((pin.offset | pin.len) & ~PAGE_MASK))
599 return -EINVAL;
600
601 if (unlikely(((__u32) -1) - pin.offset < pin.len))
602 return -EINVAL;
603
604 if (unlikely(PAGE_ALIGN(asma->size) < pin.offset + pin.len))
605 return -EINVAL;
606
607 pgstart = pin.offset / PAGE_SIZE;
608 pgend = pgstart + (pin.len / PAGE_SIZE) - 1;
609
610 mutex_lock(&ashmem_mutex);
611
612 switch (cmd) {
613 case ASHMEM_PIN:
614 ret = ashmem_pin(asma, pgstart, pgend);
615 break;
616 case ASHMEM_UNPIN:
617 ret = ashmem_unpin(asma, pgstart, pgend);
618 break;
619 case ASHMEM_GET_PIN_STATUS:
620 ret = ashmem_get_pin_status(asma, pgstart, pgend);
621 break;
622 }
623
624 mutex_unlock(&ashmem_mutex);
625
626 return ret;
627}
628
629static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
630{
631 struct ashmem_area *asma = file->private_data;
632 long ret = -ENOTTY;
633
634 switch (cmd) {
635 case ASHMEM_SET_NAME:
636 ret = set_name(asma, (void __user *) arg);
637 break;
638 case ASHMEM_GET_NAME:
639 ret = get_name(asma, (void __user *) arg);
640 break;
641 case ASHMEM_SET_SIZE:
642 ret = -EINVAL;
643 if (!asma->file) {
644 ret = 0;
645 asma->size = (size_t) arg;
646 }
647 break;
648 case ASHMEM_GET_SIZE:
649 ret = asma->size;
650 break;
651 case ASHMEM_SET_PROT_MASK:
652 ret = set_prot_mask(asma, arg);
653 break;
654 case ASHMEM_GET_PROT_MASK:
655 ret = asma->prot_mask;
656 break;
657 case ASHMEM_PIN:
658 case ASHMEM_UNPIN:
659 case ASHMEM_GET_PIN_STATUS:
660 ret = ashmem_pin_unpin(asma, cmd, (void __user *) arg);
661 break;
662 case ASHMEM_PURGE_ALL_CACHES:
663 ret = -EPERM;
664 if (capable(CAP_SYS_ADMIN)) {
665 struct shrink_control sc = {
666 .gfp_mask = GFP_KERNEL,
667 .nr_to_scan = 0,
668 };
669 ret = ashmem_shrink(&ashmem_shrinker, &sc);
670 sc.nr_to_scan = ret;
671 ashmem_shrink(&ashmem_shrinker, &sc);
672 }
673 break;
674 }
675
676 return ret;
677}
678
679static struct file_operations ashmem_fops = {
680 .owner = THIS_MODULE,
681 .open = ashmem_open,
682 .release = ashmem_release,
683 .read = ashmem_read,
684 .llseek = ashmem_llseek,
685 .mmap = ashmem_mmap,
686 .unlocked_ioctl = ashmem_ioctl,
687 .compat_ioctl = ashmem_ioctl,
688};
689
690static struct miscdevice ashmem_misc = {
691 .minor = MISC_DYNAMIC_MINOR,
692 .name = "ashmem",
693 .fops = &ashmem_fops,
694};
695
696static int __init ashmem_init(void)
697{
698 int ret;
699
700 ashmem_area_cachep = kmem_cache_create("ashmem_area_cache",
701 sizeof(struct ashmem_area),
702 0, 0, NULL);
703 if (unlikely(!ashmem_area_cachep)) {
704 printk(KERN_ERR "ashmem: failed to create slab cache\n");
705 return -ENOMEM;
706 }
707
708 ashmem_range_cachep = kmem_cache_create("ashmem_range_cache",
709 sizeof(struct ashmem_range),
710 0, 0, NULL);
711 if (unlikely(!ashmem_range_cachep)) {
712 printk(KERN_ERR "ashmem: failed to create slab cache\n");
713 return -ENOMEM;
714 }
715
716 ret = misc_register(&ashmem_misc);
717 if (unlikely(ret)) {
718 printk(KERN_ERR "ashmem: failed to register misc device!\n");
719 return ret;
720 }
721
722 register_shrinker(&ashmem_shrinker);
723
724 printk(KERN_INFO "ashmem: initialized\n");
725
726 return 0;
727}
728
729static void __exit ashmem_exit(void)
730{
731 int ret;
732
733 unregister_shrinker(&ashmem_shrinker);
734
735 ret = misc_deregister(&ashmem_misc);
736 if (unlikely(ret))
737 printk(KERN_ERR "ashmem: failed to unregister misc device!\n");
738
739 kmem_cache_destroy(ashmem_range_cachep);
740 kmem_cache_destroy(ashmem_area_cachep);
741
742 printk(KERN_INFO "ashmem: unloaded\n");
743}
744
745module_init(ashmem_init);
746module_exit(ashmem_exit);
747
748MODULE_LICENSE("GPL");
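To make the interface added above concrete, here is a minimal user-space sketch (not part of the patch; it assumes <linux/ashmem.h> is exported to user space, a 4 KiB page size, and that the misc device shows up as /dev/ashmem):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <unistd.h>
	#include <linux/ashmem.h>

	int main(void)
	{
		char name[ASHMEM_NAME_LEN] = "example-region";
		struct ashmem_pin pin = { .offset = 4096, .len = 4096 };
		size_t size = 4 * 4096;	/* four pages */
		void *p;
		int fd, purged;

		fd = open("/dev/ashmem", O_RDWR);
		if (fd < 0)
			return 1;

		/* Name and size must be set before the first mmap(). */
		ioctl(fd, ASHMEM_SET_NAME, name);
		ioctl(fd, ASHMEM_SET_SIZE, size);

		p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED)
			return 1;

		/* Let the kernel reclaim the second page under memory pressure. */
		ioctl(fd, ASHMEM_UNPIN, &pin);

		/* Pin it again; the return value says whether it was purged. */
		purged = ioctl(fd, ASHMEM_PIN, &pin);
		printf("second page purged: %s\n",
		       purged == ASHMEM_WAS_PURGED ? "yes" : "no");

		munmap(p, size);
		close(fd);
		return 0;
	}

The pin/unpin offsets and lengths must be page aligned, matching the PAGE_MASK check in ashmem_pin_unpin() above.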
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f032e6e1e09..cb9f1c2d01a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer;
45static int bdi_sync_supers(void *); 45static int bdi_sync_supers(void *);
46static void sync_supers_timer_fn(unsigned long); 46static void sync_supers_timer_fn(unsigned long);
47 47
48void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
49{
50 if (wb1 < wb2) {
51 spin_lock(&wb1->list_lock);
52 spin_lock_nested(&wb2->list_lock, 1);
53 } else {
54 spin_lock(&wb2->list_lock);
55 spin_lock_nested(&wb1->list_lock, 1);
56 }
57}
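Taking the two list_locks in address order means that two concurrent callers passing the same pair of writebacks in opposite order still acquire the locks in the same order, which avoids an ABBA deadlock; the spin_lock_nested(..., 1) annotation tells lockdep that the second lock of the same class is taken deliberately.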
58
48#ifdef CONFIG_DEBUG_FS 59#ifdef CONFIG_DEBUG_FS
49#include <linux/debugfs.h> 60#include <linux/debugfs.h>
50#include <linux/seq_file.h> 61#include <linux/seq_file.h>
@@ -67,34 +78,44 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
67 struct inode *inode; 78 struct inode *inode;
68 79
69 nr_dirty = nr_io = nr_more_io = 0; 80 nr_dirty = nr_io = nr_more_io = 0;
70 spin_lock(&inode_wb_list_lock); 81 spin_lock(&wb->list_lock);
71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 82 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
72 nr_dirty++; 83 nr_dirty++;
73 list_for_each_entry(inode, &wb->b_io, i_wb_list) 84 list_for_each_entry(inode, &wb->b_io, i_wb_list)
74 nr_io++; 85 nr_io++;
75 list_for_each_entry(inode, &wb->b_more_io, i_wb_list) 86 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
76 nr_more_io++; 87 nr_more_io++;
77 spin_unlock(&inode_wb_list_lock); 88 spin_unlock(&wb->list_lock);
78 89
79 global_dirty_limits(&background_thresh, &dirty_thresh); 90 global_dirty_limits(&background_thresh, &dirty_thresh);
80 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 91 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
81 92
82#define K(x) ((x) << (PAGE_SHIFT - 10)) 93#define K(x) ((x) << (PAGE_SHIFT - 10))
83 seq_printf(m, 94 seq_printf(m,
84 "BdiWriteback: %8lu kB\n" 95 "BdiWriteback: %10lu kB\n"
85 "BdiReclaimable: %8lu kB\n" 96 "BdiReclaimable: %10lu kB\n"
86 "BdiDirtyThresh: %8lu kB\n" 97 "BdiDirtyThresh: %10lu kB\n"
87 "DirtyThresh: %8lu kB\n" 98 "DirtyThresh: %10lu kB\n"
88 "BackgroundThresh: %8lu kB\n" 99 "BackgroundThresh: %10lu kB\n"
89 "b_dirty: %8lu\n" 100 "BdiDirtied: %10lu kB\n"
90 "b_io: %8lu\n" 101 "BdiWritten: %10lu kB\n"
91 "b_more_io: %8lu\n" 102 "BdiWriteBandwidth: %10lu kBps\n"
92 "bdi_list: %8u\n" 103 "b_dirty: %10lu\n"
93 "state: %8lx\n", 104 "b_io: %10lu\n"
105 "b_more_io: %10lu\n"
106 "bdi_list: %10u\n"
107 "state: %10lx\n",
94 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 108 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
95 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 109 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
96 K(bdi_thresh), K(dirty_thresh), 110 K(bdi_thresh),
97 K(background_thresh), nr_dirty, nr_io, nr_more_io, 111 K(dirty_thresh),
112 K(background_thresh),
113 (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
114 (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
115 (unsigned long) K(bdi->write_bandwidth),
116 nr_dirty,
117 nr_io,
118 nr_more_io,
98 !list_empty(&bdi->bdi_list), bdi->state); 119 !list_empty(&bdi->bdi_list), bdi->state);
99#undef K 120#undef K
100 121
@@ -249,18 +270,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
249 return wb_has_dirty_io(&bdi->wb); 270 return wb_has_dirty_io(&bdi->wb);
250} 271}
251 272
252static void bdi_flush_io(struct backing_dev_info *bdi)
253{
254 struct writeback_control wbc = {
255 .sync_mode = WB_SYNC_NONE,
256 .older_than_this = NULL,
257 .range_cyclic = 1,
258 .nr_to_write = 1024,
259 };
260
261 writeback_inodes_wb(&bdi->wb, &wbc);
262}
263
264/* 273/*
265 * kupdated() used to do this. We cannot do it from the bdi_forker_thread() 274 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
266 * or we risk deadlocking on ->s_umount. The longer term solution would be 275 * or we risk deadlocking on ->s_umount. The longer term solution would be
@@ -352,6 +361,17 @@ static unsigned long bdi_longest_inactive(void)
352 return max(5UL * 60 * HZ, interval); 361 return max(5UL * 60 * HZ, interval);
353} 362}
354 363
364/*
365 * Clear pending bit and wakeup anybody waiting for flusher thread creation or
366 * shutdown
367 */
368static void bdi_clear_pending(struct backing_dev_info *bdi)
369{
370 clear_bit(BDI_pending, &bdi->state);
371 smp_mb__after_clear_bit();
372 wake_up_bit(&bdi->state, BDI_pending);
373}
374
355static int bdi_forker_thread(void *ptr) 375static int bdi_forker_thread(void *ptr)
356{ 376{
357 struct bdi_writeback *me = ptr; 377 struct bdi_writeback *me = ptr;
@@ -383,6 +403,13 @@ static int bdi_forker_thread(void *ptr)
383 } 403 }
384 404
385 spin_lock_bh(&bdi_lock); 405 spin_lock_bh(&bdi_lock);
406 /*
407 * In the following loop we are going to check whether we have
408 * some work to do without any synchronization with tasks
409 * waking us up to do work for them. So we have to set task
410 * state already here so that we don't miss wakeups coming
411 * after we verify some condition.
412 */
386 set_current_state(TASK_INTERRUPTIBLE); 413 set_current_state(TASK_INTERRUPTIBLE);
387 414
388 list_for_each_entry(bdi, &bdi_list, bdi_list) { 415 list_for_each_entry(bdi, &bdi_list, bdi_list) {
@@ -446,9 +473,11 @@ static int bdi_forker_thread(void *ptr)
446 if (IS_ERR(task)) { 473 if (IS_ERR(task)) {
447 /* 474 /*
448 * If thread creation fails, force writeout of 475 * If thread creation fails, force writeout of
449 * the bdi from the thread. 476 * the bdi from the thread. Hopefully 1024 is
477 * large enough for efficient IO.
450 */ 478 */
451 bdi_flush_io(bdi); 479 writeback_inodes_wb(&bdi->wb, 1024,
480 WB_REASON_FORKER_THREAD);
452 } else { 481 } else {
453 /* 482 /*
454 * The spinlock makes sure we do not lose 483 * The spinlock makes sure we do not lose
@@ -461,11 +490,13 @@ static int bdi_forker_thread(void *ptr)
461 spin_unlock_bh(&bdi->wb_lock); 490 spin_unlock_bh(&bdi->wb_lock);
462 wake_up_process(task); 491 wake_up_process(task);
463 } 492 }
493 bdi_clear_pending(bdi);
464 break; 494 break;
465 495
466 case KILL_THREAD: 496 case KILL_THREAD:
467 __set_current_state(TASK_RUNNING); 497 __set_current_state(TASK_RUNNING);
468 kthread_stop(task); 498 kthread_stop(task);
499 bdi_clear_pending(bdi);
469 break; 500 break;
470 501
471 case NO_ACTION: 502 case NO_ACTION:
@@ -481,16 +512,8 @@ static int bdi_forker_thread(void *ptr)
481 else 512 else
482 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); 513 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
483 try_to_freeze(); 514 try_to_freeze();
484 /* Back to the main loop */ 515 break;
485 continue;
486 } 516 }
487
488 /*
489 * Clear pending bit and wakeup anybody waiting to tear us down.
490 */
491 clear_bit(BDI_pending, &bdi->state);
492 smp_mb__after_clear_bit();
493 wake_up_bit(&bdi->state, BDI_pending);
494 } 517 }
495 518
496 return 0; 519 return 0;
@@ -505,7 +528,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
505 list_del_rcu(&bdi->bdi_list); 528 list_del_rcu(&bdi->bdi_list);
506 spin_unlock_bh(&bdi_lock); 529 spin_unlock_bh(&bdi_lock);
507 530
508 synchronize_rcu(); 531 synchronize_rcu_expedited();
509} 532}
510 533
511int bdi_register(struct backing_dev_info *bdi, struct device *parent, 534int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -606,6 +629,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
606void bdi_unregister(struct backing_dev_info *bdi) 629void bdi_unregister(struct backing_dev_info *bdi)
607{ 630{
608 if (bdi->dev) { 631 if (bdi->dev) {
632 bdi_set_min_ratio(bdi, 0);
609 trace_writeback_bdi_unregister(bdi); 633 trace_writeback_bdi_unregister(bdi);
610 bdi_prune_sb(bdi); 634 bdi_prune_sb(bdi);
611 del_timer_sync(&bdi->wb.wakeup_timer); 635 del_timer_sync(&bdi->wb.wakeup_timer);
@@ -628,9 +652,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
628 INIT_LIST_HEAD(&wb->b_dirty); 652 INIT_LIST_HEAD(&wb->b_dirty);
629 INIT_LIST_HEAD(&wb->b_io); 653 INIT_LIST_HEAD(&wb->b_io);
630 INIT_LIST_HEAD(&wb->b_more_io); 654 INIT_LIST_HEAD(&wb->b_more_io);
655 spin_lock_init(&wb->list_lock);
631 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); 656 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
632} 657}
633 658
659/*
660 * Initial write bandwidth: 100 MB/s
661 */
662#define INIT_BW (100 << (20 - PAGE_SHIFT))
663
634int bdi_init(struct backing_dev_info *bdi) 664int bdi_init(struct backing_dev_info *bdi)
635{ 665{
636 int i, err; 666 int i, err;
@@ -653,6 +683,15 @@ int bdi_init(struct backing_dev_info *bdi)
653 } 683 }
654 684
655 bdi->dirty_exceeded = 0; 685 bdi->dirty_exceeded = 0;
686
687 bdi->bw_time_stamp = jiffies;
688 bdi->written_stamp = 0;
689
690 bdi->balanced_dirty_ratelimit = INIT_BW;
691 bdi->dirty_ratelimit = INIT_BW;
692 bdi->write_bandwidth = INIT_BW;
693 bdi->avg_write_bandwidth = INIT_BW;
694
656 err = prop_local_init_percpu(&bdi->completions); 695 err = prop_local_init_percpu(&bdi->completions);
657 696
658 if (err) { 697 if (err) {
@@ -676,15 +715,24 @@ void bdi_destroy(struct backing_dev_info *bdi)
676 if (bdi_has_dirty_io(bdi)) { 715 if (bdi_has_dirty_io(bdi)) {
677 struct bdi_writeback *dst = &default_backing_dev_info.wb; 716 struct bdi_writeback *dst = &default_backing_dev_info.wb;
678 717
679 spin_lock(&inode_wb_list_lock); 718 bdi_lock_two(&bdi->wb, dst);
680 list_splice(&bdi->wb.b_dirty, &dst->b_dirty); 719 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
681 list_splice(&bdi->wb.b_io, &dst->b_io); 720 list_splice(&bdi->wb.b_io, &dst->b_io);
682 list_splice(&bdi->wb.b_more_io, &dst->b_more_io); 721 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
683 spin_unlock(&inode_wb_list_lock); 722 spin_unlock(&bdi->wb.list_lock);
723 spin_unlock(&dst->list_lock);
684 } 724 }
685 725
686 bdi_unregister(bdi); 726 bdi_unregister(bdi);
687 727
728 /*
729 * If bdi_unregister() had already been called earlier, the
730 * wakeup_timer could still be armed because bdi_prune_sb()
731 * can race with the bdi_wakeup_thread_delayed() calls from
732 * __mark_inode_dirty().
733 */
734 del_timer_sync(&bdi->wb.wakeup_timer);
735
688 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 736 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
689 percpu_counter_destroy(&bdi->bdi_stat[i]); 737 percpu_counter_destroy(&bdi->bdi_stat[i]);
690 738
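A note on the INIT_BW constant introduced above: bandwidth is tracked in pages per second, so 100 MB/s = (100 << 20) bytes/s divided by the page size (1 << PAGE_SHIFT) gives 100 << (20 - PAGE_SHIFT) pages/s, i.e. 25600 pages per second with 4 KiB pages.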
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 01d5a4b3dd0..9686c4e3f80 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -768,14 +768,13 @@ void * __init alloc_bootmem_section(unsigned long size,
768 unsigned long section_nr) 768 unsigned long section_nr)
769{ 769{
770 bootmem_data_t *bdata; 770 bootmem_data_t *bdata;
771 unsigned long pfn, goal, limit; 771 unsigned long pfn, goal;
772 772
773 pfn = section_nr_to_pfn(section_nr); 773 pfn = section_nr_to_pfn(section_nr);
774 goal = pfn << PAGE_SHIFT; 774 goal = pfn << PAGE_SHIFT;
775 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
776 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; 775 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
777 776
778 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); 777 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
779} 778}
780#endif 779#endif
781 780
diff --git a/mm/cma-best-fit.c b/mm/cma-best-fit.c
new file mode 100644
index 00000000000..24c27c89cae
--- /dev/null
+++ b/mm/cma-best-fit.c
@@ -0,0 +1,408 @@
1/*
2 * Contiguous Memory Allocator framework: Best Fit allocator
3 * Copyright (c) 2010 by Samsung Electronics.
4 * Written by Michal Nazarewicz (m.nazarewicz@samsung.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License or (at your option) any later version of the license.
10 */
11
12#define pr_fmt(fmt) "cma: bf: " fmt
13
14#ifdef CONFIG_CMA_DEBUG
15# define DEBUG
16#endif
17
18#include <linux/errno.h> /* Error numbers */
19#include <linux/slab.h> /* kmalloc() */
20
21#include <linux/cma.h> /* CMA structures */
22
23
24/************************* Data Types *************************/
25
26struct cma_bf_item {
27 struct cma_chunk ch;
28 struct rb_node by_size;
29};
30
31struct cma_bf_private {
32 struct rb_root by_start_root;
33 struct rb_root by_size_root;
34};
35
36
37/************************* Prototypes *************************/
38
39/*
40 * These are only for holes. They must be called whenever a hole's
41 * properties change, but also whenever a chunk becomes a hole or a
42 * hole becomes a chunk.
43 */
44static void __cma_bf_hole_insert_by_size(struct cma_bf_item *item);
45static void __cma_bf_hole_erase_by_size(struct cma_bf_item *item);
46static int __must_check
47__cma_bf_hole_insert_by_start(struct cma_bf_item *item);
48static void __cma_bf_hole_erase_by_start(struct cma_bf_item *item);
49
50/**
51 * __cma_bf_hole_take - takes a chunk of memory out of a hole.
52 * @hole: hole to take chunk from
53 * @size: chunk's size
54 * @alignment: chunk's starting address alignment (must be power of two)
55 *
56 * Takes a @size bytes large chunk from hole @hole which must be able
57 * to hold the chunk. The "must be able" includes also alignment
58 * constraint.
59 *
60 * Returns allocated item or NULL on error (if kmalloc() failed).
61 */
62static struct cma_bf_item *__must_check
63__cma_bf_hole_take(struct cma_bf_item *hole, size_t size, dma_addr_t alignment);
64
65/**
66 * __cma_bf_hole_merge_maybe - tries to merge hole with neighbours.
67 * @item: hole to try and merge
68 *
69 * Which items are preserved is undefined so you may not rely on it.
70 */
71static void __cma_bf_hole_merge_maybe(struct cma_bf_item *item);
72
73
74/************************* Device API *************************/
75
76int cma_bf_init(struct cma_region *reg)
77{
78 struct cma_bf_private *prv;
79 struct cma_bf_item *item;
80
81 prv = kzalloc(sizeof *prv, GFP_KERNEL);
82 if (unlikely(!prv))
83 return -ENOMEM;
84
85 item = kzalloc(sizeof *item, GFP_KERNEL);
86 if (unlikely(!item)) {
87 kfree(prv);
88 return -ENOMEM;
89 }
90
91 item->ch.start = reg->start;
92 item->ch.size = reg->size;
93 item->ch.reg = reg;
94
95 rb_root_init(&prv->by_start_root, &item->ch.by_start);
96 rb_root_init(&prv->by_size_root, &item->by_size);
97
98 reg->private_data = prv;
99 return 0;
100}
101
102void cma_bf_cleanup(struct cma_region *reg)
103{
104 struct cma_bf_private *prv = reg->private_data;
105 struct cma_bf_item *item =
106 rb_entry(prv->by_size_root.rb_node,
107 struct cma_bf_item, by_size);
108
109 /* We can assume there is only a single hole in the tree. */
110 WARN_ON(item->by_size.rb_left || item->by_size.rb_right ||
111 item->ch.by_start.rb_left || item->ch.by_start.rb_right);
112
113 kfree(item);
114 kfree(prv);
115}
116
117struct cma_chunk *cma_bf_alloc(struct cma_region *reg,
118 size_t size, dma_addr_t alignment)
119{
120 struct cma_bf_private *prv = reg->private_data;
121 struct rb_node *node = prv->by_size_root.rb_node;
122 struct cma_bf_item *item = NULL;
123
124 /* First find hole that is large enough */
125 while (node) {
126 struct cma_bf_item *i =
127 rb_entry(node, struct cma_bf_item, by_size);
128
129 if (i->ch.size < size) {
130 node = node->rb_right;
131 } else if (i->ch.size >= size) {
132 node = node->rb_left;
133 item = i;
134 }
135 }
136 if (!item)
137 return NULL;
138
139 /* Now look for items which can satisfy alignment requirements */
140 node = &item->by_size;
141 for (;;) {
142 dma_addr_t start = ALIGN(item->ch.start, alignment);
143 dma_addr_t end = item->ch.start + item->ch.size;
144 if (start < end && end - start >= size) {
145 item = __cma_bf_hole_take(item, size, alignment);
146 return likely(item) ? &item->ch : NULL;
147 }
148
149 node = rb_next(node);
150 if (!node)
151 return NULL;
152
153 item = rb_entry(node, struct cma_bf_item, by_size);
154 }
155}
156
157void cma_bf_free(struct cma_chunk *chunk)
158{
159 struct cma_bf_item *item = container_of(chunk, struct cma_bf_item, ch);
160
161 /* Add new hole */
162 if (unlikely(__cma_bf_hole_insert_by_start(item))) {
163 /*
164 * We're screwed... Just free the item and forget
165 * about it. Things are broken beyond repair so no
166 * sense in trying to recover.
167 */
168 kfree(item);
169 } else {
170 __cma_bf_hole_insert_by_size(item);
171
172 /* Merge with prev and next sibling */
173 __cma_bf_hole_merge_maybe(item);
174 }
175}
176
177
178/************************* Basic Tree Manipulation *************************/
179
180static void __cma_bf_hole_insert_by_size(struct cma_bf_item *item)
181{
182 struct cma_bf_private *prv = item->ch.reg->private_data;
183 struct rb_node **link = &prv->by_size_root.rb_node, *parent = NULL;
184 const typeof(item->ch.size) value = item->ch.size;
185
186 while (*link) {
187 struct cma_bf_item *i;
188 parent = *link;
189 i = rb_entry(parent, struct cma_bf_item, by_size);
190 link = value <= i->ch.size
191 ? &parent->rb_left
192 : &parent->rb_right;
193 }
194
195 rb_link_node(&item->by_size, parent, link);
196 rb_insert_color(&item->by_size, &prv->by_size_root);
197}
198
199static void __cma_bf_hole_erase_by_size(struct cma_bf_item *item)
200{
201 struct cma_bf_private *prv = item->ch.reg->private_data;
202 rb_erase(&item->by_size, &prv->by_size_root);
203}
204
205static int __must_check
206__cma_bf_hole_insert_by_start(struct cma_bf_item *item)
207{
208 struct cma_bf_private *prv = item->ch.reg->private_data;
209 struct rb_node **link = &prv->by_start_root.rb_node, *parent = NULL;
210 const typeof(item->ch.start) value = item->ch.start;
211
212 while (*link) {
213 struct cma_bf_item *i;
214 parent = *link;
215 i = rb_entry(parent, struct cma_bf_item, ch.by_start);
216
217 if (WARN_ON(value == i->ch.start))
218 /*
219 * This should *never* happen. And I mean
220 * *never*. We could even BUG on it but
221 * hopefully things are only a bit broken,
222 * ie. system can still run. We produce
223 * a warning and return an error.
224 */
225 return -EBUSY;
226
227 link = value <= i->ch.start
228 ? &parent->rb_left
229 : &parent->rb_right;
230 }
231
232 rb_link_node(&item->ch.by_start, parent, link);
233 rb_insert_color(&item->ch.by_start, &prv->by_start_root);
234 return 0;
235}
236
237static void __cma_bf_hole_erase_by_start(struct cma_bf_item *item)
238{
239 struct cma_bf_private *prv = item->ch.reg->private_data;
240 rb_erase(&item->ch.by_start, &prv->by_start_root);
241}
242
243
244/************************* More Tree Manipulation *************************/
245
246static struct cma_bf_item *__must_check
247__cma_bf_hole_take(struct cma_bf_item *hole, size_t size, size_t alignment)
248{
249 struct cma_bf_item *item;
250
251 /*
252 * There are three cases:
253 * 1. the chunk takes the whole hole,
254 * 2. the chunk is at the beginning or at the end of the hole, or
255 * 3. the chunk is in the middle of the hole.
256 */
257
258
259 /* Case 1, the whole hole */
260 if (size == hole->ch.size) {
261 __cma_bf_hole_erase_by_size(hole);
262 __cma_bf_hole_erase_by_start(hole);
263 return hole;
264 }
265
266
267 /* Allocate */
268 item = kmalloc(sizeof *item, GFP_KERNEL);
269 if (unlikely(!item))
270 return NULL;
271
272 item->ch.start = ALIGN(hole->ch.start, alignment);
273 item->ch.size = size;
274
275 /* Case 3, in the middle */
276 if (item->ch.start != hole->ch.start
277 && item->ch.start + item->ch.size !=
278 hole->ch.start + hole->ch.size) {
279 struct cma_bf_item *tail;
280
281 /*
282 * Space between the end of the chunk and the end of
283 * the region, ie. space left after the end of the
284 * chunk. If this is dividable by alignment we can
285 * move the chunk to the end of the hole.
286 */
287 size_t left =
288 hole->ch.start + hole->ch.size -
289 (item->ch.start + item->ch.size);
290 if (left % alignment == 0) {
291 item->ch.start += left;
292 goto case_2;
293 }
294
295 /*
296 * We are going to add a hole at the end. This way,
297 * we will reduce the problem to case 2 -- the chunk
298 * will be at the end of the hole.
299 */
300 tail = kmalloc(sizeof *tail, GFP_KERNEL);
301 if (unlikely(!tail)) {
302 kfree(item);
303 return NULL;
304 }
305
306 tail->ch.start = item->ch.start + item->ch.size;
307 tail->ch.size =
308 hole->ch.start + hole->ch.size - tail->ch.start;
309 tail->ch.reg = hole->ch.reg;
310
311 if (unlikely(__cma_bf_hole_insert_by_start(tail))) {
312 /*
313 * Things are broken beyond repair... Abort
314 * inserting the hole but still continue with
315 * allocation (seems like the best we can do).
316 */
317
318 hole->ch.size = tail->ch.start - hole->ch.start;
319 kfree(tail);
320 } else {
321 __cma_bf_hole_insert_by_size(tail);
322 /*
323 * It's important that we first insert the new
324 * hole in the tree sorted by size and later
325 * reduce the size of the old hole. We will
326 * update the position of the old hole in the
327 * rb tree in code that handles case 2.
328 */
329 hole->ch.size = tail->ch.start - hole->ch.start;
330 }
331
332 /* Go to case 2 */
333 }
334
335
336 /* Case 2, at the beginning or at the end */
337case_2:
338 /* No need to update the tree; order preserved. */
339 if (item->ch.start == hole->ch.start)
340 hole->ch.start += item->ch.size;
341
342 /* Alter hole's size */
343 hole->ch.size -= size;
344 __cma_bf_hole_erase_by_size(hole);
345 __cma_bf_hole_insert_by_size(hole);
346
347 return item;
348}
349
350
351static void __cma_bf_hole_merge_maybe(struct cma_bf_item *item)
352{
353 struct cma_bf_item *prev;
354 struct rb_node *node;
355 int twice = 2;
356
357 node = rb_prev(&item->ch.by_start);
358 if (unlikely(!node))
359 goto next;
360 prev = rb_entry(node, struct cma_bf_item, ch.by_start);
361
362 for (;;) {
363 if (prev->ch.start + prev->ch.size == item->ch.start) {
364 /* Remove previous hole from trees */
365 __cma_bf_hole_erase_by_size(prev);
366 __cma_bf_hole_erase_by_start(prev);
367
368 /* Alter this hole */
369 item->ch.size += prev->ch.size;
370 item->ch.start = prev->ch.start;
371 __cma_bf_hole_erase_by_size(item);
372 __cma_bf_hole_insert_by_size(item);
373 /*
374 * No need to update by start trees as we do
375 * not break sequence order
376 */
377
378 /* Free prev hole */
379 kfree(prev);
380 }
381
382next:
383 if (!--twice)
384 break;
385
386 node = rb_next(&item->ch.by_start);
387 if (unlikely(!node))
388 break;
389 prev = item;
390 item = rb_entry(node, struct cma_bf_item, ch.by_start);
391 }
392}
393
394
395
396/************************* Register *************************/
397static int cma_bf_module_init(void)
398{
399 static struct cma_allocator alloc = {
400 .name = "bf",
401 .init = cma_bf_init,
402 .cleanup = cma_bf_cleanup,
403 .alloc = cma_bf_alloc,
404 .free = cma_bf_free,
405 };
406 return cma_allocator_register(&alloc);
407}
408module_init(cma_bf_module_init);
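A worked example of the allocator above (illustration, not part of the patch): with free holes of 1 MiB, 4 MiB and 16 MiB in the size-sorted rb-tree, a 2 MiB request descends the tree in cma_bf_alloc() and settles on the 4 MiB hole, the smallest one that is large enough; __cma_bf_hole_take() then carves the chunk out of it, leaving a 2 MiB hole behind (case 2), or splits it into two holes when alignment forces the chunk into the middle (case 3).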
diff --git a/mm/cma.c b/mm/cma.c
new file mode 100644
index 00000000000..546dd861bdb
--- /dev/null
+++ b/mm/cma.c
@@ -0,0 +1,1413 @@
1/*
2 * Contiguous Memory Allocator framework
3 * Copyright (c) 2010 by Samsung Electronics.
4 * Written by Michal Nazarewicz (m.nazarewicz@samsung.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License or (at your option) any later version of the license.
10 */
11
12/*
13 * See Documentation/contiguous-memory.txt for details.
14 */
15
16#define pr_fmt(fmt) "cma: " fmt
17
18#ifdef CONFIG_CMA_DEBUG
19# define DEBUG
20#endif
21
22#ifndef CONFIG_NO_BOOTMEM
23# include <linux/bootmem.h> /* alloc_bootmem_pages_nopanic() */
24#endif
25#ifdef CONFIG_HAVE_MEMBLOCK
26# include <linux/memblock.h> /* memblock*() */
27#endif
28#include <linux/device.h> /* struct device, dev_name() */
29#include <linux/errno.h> /* Error numbers */
30#include <linux/err.h> /* IS_ERR, PTR_ERR, etc. */
31#include <linux/mm.h> /* PAGE_ALIGN() */
32#include <linux/module.h> /* EXPORT_SYMBOL_GPL() */
33#include <linux/mutex.h> /* mutex */
34#include <linux/slab.h> /* kmalloc() */
35#include <linux/string.h> /* str*() */
36
37#include <linux/cma.h>
38#include <linux/vmalloc.h>
39
40/*
41 * Protects cma_regions, cma_allocators, cma_map, cma_map_length,
42 * cma_kobj, cma_sysfs_regions and cma_chunks_by_start.
43 */
44static DEFINE_MUTEX(cma_mutex);
45
46
47
48/************************* Map attribute *************************/
49
50static const char *cma_map;
51static size_t cma_map_length;
52
53/*
54 * map-attr ::= [ rules [ ';' ] ]
55 * rules ::= rule [ ';' rules ]
56 * rule ::= patterns '=' regions
57 * patterns ::= pattern [ ',' patterns ]
58 * regions ::= REG-NAME [ ',' regions ]
59 * pattern ::= dev-pattern [ '/' TYPE-NAME ] | '/' TYPE-NAME
60 *
61 * See Documentation/contiguous-memory.txt for details.
62 */
63static ssize_t cma_map_validate(const char *param)
64{
65 const char *ch = param;
66
67 if (*ch == '\0' || *ch == '\n')
68 return 0;
69
70 for (;;) {
71 const char *start = ch;
72
73 while (*ch && *ch != '\n' && *ch != ';' && *ch != '=')
74 ++ch;
75
76 if (*ch != '=' || start == ch) {
77 pr_err("map: expecting \"<patterns>=<regions>\" near %s\n",
78 start);
79 return -EINVAL;
80 }
81
82 while (*++ch != ';')
83 if (*ch == '\0' || *ch == '\n')
84 return ch - param;
85 if (ch[1] == '\0' || ch[1] == '\n')
86 return ch - param;
87 ++ch;
88 }
89}
90
91static int __init cma_map_param(char *param)
92{
93 ssize_t len;
94
95 pr_debug("param: map: %s\n", param);
96
97 len = cma_map_validate(param);
98 if (len < 0)
99 return len;
100
101 cma_map = param;
102 cma_map_length = len;
103 return 0;
104}
105
106#if defined CONFIG_CMA_CMDLINE
107
108early_param("cma.map", cma_map_param);
109
110#endif
111
112
113
114/************************* Early regions *************************/
115
116struct list_head cma_early_regions __initdata =
117 LIST_HEAD_INIT(cma_early_regions);
118
119#ifdef CONFIG_CMA_CMDLINE
120
121/*
122 * regions-attr ::= [ regions [ ';' ] ]
123 * regions ::= region [ ';' regions ]
124 *
125 * region ::= [ '-' ] reg-name
126 * '=' size
127 * [ '@' start ]
128 * [ '/' alignment ]
129 * [ ':' alloc-name ]
130 *
131 * See Documentation/contiguous-memory.txt for details.
132 *
133 * Example:
134 * cma=reg1=64M:bf;reg2=32M@0x100000:bf;reg3=64M/1M:bf
135 *
136 * If the allocator is omitted, the first available allocator will be used.
137 */
138
139#define NUMPARSE(cond_ch, type, cond) ({ \
140 unsigned long long v = 0; \
141 if (*param == (cond_ch)) { \
142 const char *const msg = param + 1; \
143 v = memparse(msg, &param); \
144 if (!v || v > ~(type)0 || !(cond)) { \
145 pr_err("param: invalid value near %s\n", msg); \
146 ret = -EINVAL; \
147 break; \
148 } \
149 } \
150 v; \
151 })
152
153static int __init cma_param_parse(char *param)
154{
155 static struct cma_region regions[16];
156
157 size_t left = ARRAY_SIZE(regions);
158 struct cma_region *reg = regions;
159 int ret = 0;
160
161 pr_debug("param: %s\n", param);
162
163 for (; *param; ++reg) {
164 dma_addr_t start, alignment;
165 size_t size;
166
167 if (unlikely(!--left)) {
168 pr_err("param: too many early regions\n");
169 return -ENOSPC;
170 }
171
172 /* Parse name */
173 reg->name = param;
174 param = strchr(param, '=');
175 if (!param || param == reg->name) {
176 pr_err("param: expected \"<name>=\" near %s\n",
177 reg->name);
178 ret = -EINVAL;
179 break;
180 }
181 *param = '\0';
182
183 /* Parse numbers */
184 size = NUMPARSE('\0', size_t, true);
185 start = NUMPARSE('@', dma_addr_t, true);
186 alignment = NUMPARSE('/', dma_addr_t, (v & (v - 1)) == 0);
187
188 alignment = max(alignment, (dma_addr_t)PAGE_SIZE);
189 start = ALIGN(start, alignment);
190 size = PAGE_ALIGN(size);
191 if (start + size < start) {
192 pr_err("param: invalid start, size combination\n");
193 ret = -EINVAL;
194 break;
195 }
196
197 /* Parse allocator */
198 if (*param == ':') {
199 reg->alloc_name = ++param;
200 while (*param && *param != ';')
201 ++param;
202 if (param == reg->alloc_name)
203 reg->alloc_name = NULL;
204 }
205
206 /* Go to next */
207 if (*param == ';') {
208 *param = '\0';
209 ++param;
210 } else if (*param) {
211 pr_err("param: expecting ';' or end of parameter near %s\n",
212 param);
213 ret = -EINVAL;
214 break;
215 }
216
217 /* Add */
218 reg->size = size;
219 reg->start = start;
220 reg->alignment = alignment;
221 reg->copy_name = 1;
222
223 list_add_tail(&reg->list, &cma_early_regions);
224
225 pr_debug("param: registering early region %s (%p@%p/%p)\n",
226 reg->name, (void *)reg->size, (void *)reg->start,
227 (void *)reg->alignment);
228 }
229
230 return ret;
231}
232early_param("cma", cma_param_parse);
233
234#undef NUMPARSE
235
236#endif
237
238
239int __init __must_check cma_early_region_register(struct cma_region *reg)
240{
241 dma_addr_t start, alignment;
242 size_t size;
243
244 if (reg->alignment & (reg->alignment - 1))
245 return -EINVAL;
246
247 alignment = max(reg->alignment, (dma_addr_t)PAGE_SIZE);
248 start = ALIGN(reg->start, alignment);
249 size = PAGE_ALIGN(reg->size);
250
251 if (start + size < start)
252 return -EINVAL;
253
254 reg->size = size;
255 reg->start = start;
256 reg->alignment = alignment;
257
258 list_add_tail(&reg->list, &cma_early_regions);
259
260 pr_debug("param: registering early region %s (%p@%p/%p)\n",
261 reg->name, (void *)reg->size, (void *)reg->start,
262 (void *)reg->alignment);
263
264 return 0;
265}
266
267
268
269/************************* Regions & Allocators *************************/
270
271static void __cma_sysfs_region_add(struct cma_region *reg);
272
273static int __cma_region_attach_alloc(struct cma_region *reg);
274static void __maybe_unused __cma_region_detach_alloc(struct cma_region *reg);
275
276
277/* List of all regions. Named regions are kept before unnamed. */
278static LIST_HEAD(cma_regions);
279
280#define cma_foreach_region(reg) \
281 list_for_each_entry(reg, &cma_regions, list)
282
283int __must_check cma_region_register(struct cma_region *reg)
284{
285 const char *name, *alloc_name;
286 struct cma_region *r;
287 char *ch = NULL;
288 int ret = 0;
289
290 if (!reg->size || reg->start + reg->size < reg->start)
291 return -EINVAL;
292
293 reg->users = 0;
294 reg->used = 0;
295 reg->private_data = NULL;
296 reg->registered = 0;
297 reg->free_space = reg->size;
298
299 /* Copy name and alloc_name */
300 name = reg->name;
301 alloc_name = reg->alloc_name;
302 if (reg->copy_name && (reg->name || reg->alloc_name)) {
303 size_t name_size, alloc_size;
304
305 name_size = reg->name ? strlen(reg->name) + 1 : 0;
306 alloc_size = reg->alloc_name ? strlen(reg->alloc_name) + 1 : 0;
307
308 ch = kmalloc(name_size + alloc_size, GFP_KERNEL);
309 if (!ch) {
310 pr_err("%s: not enough memory to allocate name\n",
311 reg->name ?: "(private)");
312 return -ENOMEM;
313 }
314
315 if (name_size) {
316 memcpy(ch, reg->name, name_size);
317 name = ch;
318 ch += name_size;
319 }
320
321 if (alloc_size) {
322 memcpy(ch, reg->alloc_name, alloc_size);
323 alloc_name = ch;
324 }
325 }
326
327 mutex_lock(&cma_mutex);
328
329 /* Don't let regions overlap */
330 cma_foreach_region(r)
331 if (r->start + r->size > reg->start &&
332 r->start < reg->start + reg->size) {
333 ret = -EADDRINUSE;
334 goto done;
335 }
336
337 if (reg->alloc) {
338 ret = __cma_region_attach_alloc(reg);
339 if (unlikely(ret < 0))
340 goto done;
341 }
342
343 reg->name = name;
344 reg->alloc_name = alloc_name;
345 reg->registered = 1;
346 ch = NULL;
347
348 /*
349 * Keep named at the beginning and unnamed (private) at the
350 * end. This helps in traversal when a named region is looked
351 * for.
352 */
353 if (name)
354 list_add(&reg->list, &cma_regions);
355 else
356 list_add_tail(&reg->list, &cma_regions);
357
358 __cma_sysfs_region_add(reg);
359
360done:
361 mutex_unlock(&cma_mutex);
362
363 pr_debug("%s: region %sregistered\n",
364 reg->name ?: "(private)", ret ? "not " : "");
365 kfree(ch);
366
367 return ret;
368}
369EXPORT_SYMBOL_GPL(cma_region_register);
370
371static struct cma_region *__must_check
372__cma_region_find(const char **namep)
373{
374 struct cma_region *reg;
375 const char *ch, *name;
376 size_t n;
377
378 ch = *namep;
379 while (*ch && *ch != ',' && *ch != ';')
380 ++ch;
381 name = *namep;
382 *namep = *ch == ',' ? ch + 1 : ch;
383 n = ch - name;
384
385 /*
386 * Named regions are kept in front of unnamed so if we
387 * encounter an unnamed region we can stop.
388 */
389 cma_foreach_region(reg)
390 if (!reg->name)
391 break;
392 else if (!strncmp(name, reg->name, n) && !reg->name[n])
393 return reg;
394
395 return NULL;
396}
397
398
399/* List of all allocators. */
400static LIST_HEAD(cma_allocators);
401
402#define cma_foreach_allocator(alloc) \
403 list_for_each_entry(alloc, &cma_allocators, list)
404
405int cma_allocator_register(struct cma_allocator *alloc)
406{
407 struct cma_region *reg;
408 int first;
409
410 if (!alloc->alloc || !alloc->free)
411 return -EINVAL;
412
413 mutex_lock(&cma_mutex);
414
415 first = list_empty(&cma_allocators);
416
417 list_add_tail(&alloc->list, &cma_allocators);
418
419 /*
420 * Attach this allocator to all allocator-less regions that
421 * request this particular allocator (reg->alloc_name equals
422 * alloc->name) or if the region wants the first available
423 * allocator and we are the first.
424 */
425 cma_foreach_region(reg) {
426 if (reg->alloc)
427 continue;
428 if (reg->alloc_name
429 ? !alloc->name || strcmp(alloc->name, reg->alloc_name)
430 : (reg->used || !first))
431 continue;
432
433 reg->alloc = alloc;
434 __cma_region_attach_alloc(reg);
435 }
436
437 mutex_unlock(&cma_mutex);
438
439 pr_debug("%s: allocator registered\n", alloc->name ?: "(unnamed)");
440
441 return 0;
442}
443EXPORT_SYMBOL_GPL(cma_allocator_register);
444
445static struct cma_allocator *__must_check
446__cma_allocator_find(const char *name)
447{
448 struct cma_allocator *alloc;
449
450 if (!name)
451 return list_empty(&cma_allocators)
452 ? NULL
453 : list_entry(cma_allocators.next,
454 struct cma_allocator, list);
455
456 cma_foreach_allocator(alloc)
457 if (alloc->name && !strcmp(name, alloc->name))
458 return alloc;
459
460 return NULL;
461}
462
463
464
465/************************* Initialise CMA *************************/
466
467int __init cma_set_defaults(struct cma_region *regions, const char *map)
468{
469 if (map) {
470 int ret = cma_map_param((char *)map);
471 if (unlikely(ret < 0))
472 return ret;
473 }
474
475 if (!regions)
476 return 0;
477
478 for (; regions->size; ++regions) {
479 int ret = cma_early_region_register(regions);
480 if (unlikely(ret < 0))
481 return ret;
482 }
483
484 return 0;
485}
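/*
 * A minimal sketch of how platform code might feed defaults to the
 * framework (illustrative only; the region names, sizes and the map
 * rule are assumptions, not taken from any real board).
 */
static struct cma_region example_regions[] = {
	{ .name = "common", .size = 16 << 20 },
	{ .name = "video",  .size = 32 << 20 },
	{ }	/* a zero size terminates the array, matching the loop above */
};

static void __init example_cma_defaults(void)
{
	/*
	 * The map rule sends allocations for a device named "video-dev"
	 * to the "video" region and lets every other device fall back to
	 * "common"; the grammar is documented at __cma_where_from()
	 * later in this file.
	 */
	if (cma_set_defaults(example_regions, "video-dev=video;*=common") < 0)
		pr_warn("example: cma_set_defaults() failed\n");
}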
486
487
488int __init cma_early_region_reserve(struct cma_region *reg)
489{
490 int tried = 0;
491
492 if (!reg->size || (reg->alignment & (reg->alignment - 1)) ||
493 reg->reserved)
494 return -EINVAL;
495
496#ifndef CONFIG_NO_BOOTMEM
497
498 tried = 1;
499
500 {
501 void *ptr = __alloc_bootmem_nopanic(reg->size, reg->alignment,
502 reg->start);
503 if (ptr) {
504 reg->start = virt_to_phys(ptr);
505 reg->reserved = 1;
506 return 0;
507 }
508 }
509
510#endif
511
512#ifdef CONFIG_HAVE_MEMBLOCK
513
514 tried = 1;
515
516 if (reg->start) {
517 if (!memblock_is_region_reserved(reg->start, reg->size) &&
518 memblock_reserve(reg->start, reg->size) >= 0) {
519 reg->reserved = 1;
520 return 0;
521 }
522 } else {
523 /*
524 * Use __memblock_alloc_base() since
525 * memblock_alloc_base() panic()s.
526 */
527 u64 ret = __memblock_alloc_base(reg->size, reg->alignment, 0);
528 if (ret &&
529 ret < ~(dma_addr_t)0 &&
530 ret + reg->size < ~(dma_addr_t)0 &&
531 ret + reg->size > ret) {
532 reg->start = ret;
533 reg->reserved = 1;
534 return 0;
535 }
536
537 if (ret)
538 memblock_free(ret, reg->size);
539 }
540
541#endif
542
543 return tried ? -ENOMEM : -EOPNOTSUPP;
544}
545
546void __init cma_early_regions_reserve(int (*reserve)(struct cma_region *reg))
547{
548 struct cma_region *reg;
549
550 pr_debug("init: reserving early regions\n");
551
552 if (!reserve)
553 reserve = cma_early_region_reserve;
554
555 list_for_each_entry(reg, &cma_early_regions, list) {
556 if (reg->reserved) {
557 /* nothing */
558 } else if (reserve(reg) >= 0) {
559 pr_debug("init: %s: reserved %p@%p\n",
560 reg->name ?: "(private)",
561 (void *)reg->size, (void *)reg->start);
562 reg->reserved = 1;
563 } else {
564 pr_warn("init: %s: unable to reserve %p@%p/%p\n",
565 reg->name ?: "(private)",
566 (void *)reg->size, (void *)reg->start,
567 (void *)reg->alignment);
568 }
569 }
570}
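/*
 * Illustrative only: an architecture's early reserve hook would
 * typically just make the call below so that every early region is
 * reserved with cma_early_region_reserve() as the default method.
 */
static void __init example_mach_reserve(void)
{
	cma_early_regions_reserve(NULL);
}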
571
572
573static int __init cma_init(void)
574{
575 struct cma_region *reg, *n;
576
577 pr_debug("init: initialising\n");
578
579 if (cma_map) {
580 char *val = kmemdup(cma_map, cma_map_length + 1, GFP_KERNEL);
581 cma_map = val;
582 if (!val)
583 return -ENOMEM;
584 val[cma_map_length] = '\0';
585 }
586
587 list_for_each_entry_safe(reg, n, &cma_early_regions, list) {
588 INIT_LIST_HEAD(&reg->list);
589 /*
590 * We don't care if there was an error. It's a pity
591 * but there's not much we can do about it anyway.
592 * If the error is on a region that was parsed from
593 * command line then it will stay and waste a bit of
594 * space; if it was registered using
595 * cma_early_region_register() it's the caller's
596 * responsibility to do something about it.
597 */
598 if (reg->reserved && cma_region_register(reg) < 0)
599 /* ignore error */;
600 }
601
602 INIT_LIST_HEAD(&cma_early_regions);
603
604 return 0;
605}
606/*
607 * We want to be initialised earlier than module_init/__initcall so
608 * that drivers that want to grab memory at boot time will find CMA
609 * ready. subsys_initcall() seems early enough and not too early at
610 * the same time.
611 */
612subsys_initcall(cma_init);
613
614
615
616/************************* SysFS *************************/
617
618#if defined CONFIG_CMA_SYSFS
619
620static struct kobject cma_sysfs_regions;
621static int cma_sysfs_regions_ready;
622
623
624#define CMA_ATTR_INLINE(_type, _name) \
625 (&((struct cma_ ## _type ## _attribute){ \
626 .attr = { \
627 .name = __stringify(_name), \
628 .mode = 0644, \
629 }, \
630 .show = cma_sysfs_ ## _type ## _ ## _name ## _show, \
631 .store = cma_sysfs_ ## _type ## _ ## _name ## _store, \
632 }).attr)
633
634#define CMA_ATTR_RO_INLINE(_type, _name) \
635 (&((struct cma_ ## _type ## _attribute){ \
636 .attr = { \
637 .name = __stringify(_name), \
638 .mode = 0444, \
639 }, \
640 .show = cma_sysfs_ ## _type ## _ ## _name ## _show, \
641 }).attr)
642
643
644struct cma_root_attribute {
645 struct attribute attr;
646 ssize_t (*show)(char *buf);
647 int (*store)(const char *buf);
648};
649
650static ssize_t cma_sysfs_root_map_show(char *page)
651{
652 ssize_t len;
653
654 len = cma_map_length;
655 if (!len) {
656 *page = 0;
657 len = 0;
658 } else {
659 if (len > (size_t)PAGE_SIZE - 1)
660 len = (size_t)PAGE_SIZE - 1;
661 memcpy(page, cma_map, len);
662 page[len++] = '\n';
663 }
664
665 return len;
666}
667
668static int cma_sysfs_root_map_store(const char *page)
669{
670 ssize_t len = cma_map_validate(page);
671 char *val = NULL;
672
673 if (len < 0)
674 return len;
675
676 if (len) {
677 val = kmemdup(page, len + 1, GFP_KERNEL);
678 if (!val)
679 return -ENOMEM;
680 val[len] = '\0';
681 }
682
683 kfree(cma_map);
684 cma_map = val;
685 cma_map_length = len;
686
687 return 0;
688}
689
690static ssize_t cma_sysfs_root_allocators_show(char *page)
691{
692 struct cma_allocator *alloc;
693 size_t left = PAGE_SIZE;
694 char *ch = page;
695
696 cma_foreach_allocator(alloc) {
697 ssize_t l = snprintf(ch, left, "%s ", alloc->name ?: "-");
698 ch += l;
699 left -= l;
700 }
701
702 if (ch != page)
703 ch[-1] = '\n';
704 return ch - page;
705}
706
707static ssize_t
708cma_sysfs_root_show(struct kobject *kobj, struct attribute *attr, char *buf)
709{
710 struct cma_root_attribute *rattr =
711 container_of(attr, struct cma_root_attribute, attr);
712 ssize_t ret;
713
714 mutex_lock(&cma_mutex);
715 ret = rattr->show(buf);
716 mutex_unlock(&cma_mutex);
717
718 return ret;
719}
720
721static ssize_t
722cma_sysfs_root_store(struct kobject *kobj, struct attribute *attr,
723 const char *buf, size_t count)
724{
725 struct cma_root_attribute *rattr =
726 container_of(attr, struct cma_root_attribute, attr);
727 int ret;
728
729 mutex_lock(&cma_mutex);
730 ret = rattr->store(buf);
731 mutex_unlock(&cma_mutex);
732
733 return ret < 0 ? ret : count;
734}
735
736static struct kobj_type cma_sysfs_root_type = {
737 .sysfs_ops = &(const struct sysfs_ops){
738 .show = cma_sysfs_root_show,
739 .store = cma_sysfs_root_store,
740 },
741 .default_attrs = (struct attribute * []) {
742 CMA_ATTR_INLINE(root, map),
743 CMA_ATTR_RO_INLINE(root, allocators),
744 NULL
745 },
746};
747
748static int __init cma_sysfs_init(void)
749{
750 static struct kobject root;
751 static struct kobj_type fake_type;
752
753 struct cma_region *reg;
754 int ret;
755
756 /* Root */
757 ret = kobject_init_and_add(&root, &cma_sysfs_root_type,
758 mm_kobj, "contiguous");
759 if (unlikely(ret < 0)) {
760 pr_err("init: unable to add root kobject: %d\n", ret);
761 return ret;
762 }
763
764 /* Regions */
765 ret = kobject_init_and_add(&cma_sysfs_regions, &fake_type,
766 &root, "regions");
767 if (unlikely(ret < 0)) {
768 pr_err("init: unable to add regions kobject: %d\n", ret);
769 return ret;
770 }
771
772 mutex_lock(&cma_mutex);
773 cma_sysfs_regions_ready = 1;
774 cma_foreach_region(reg)
775 __cma_sysfs_region_add(reg);
776 mutex_unlock(&cma_mutex);
777
778 return 0;
779}
780device_initcall(cma_sysfs_init);
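/*
 * For reference, the kobjects created by this SysFS section live under
 * /sys/kernel/mm (mm_kobj), so the resulting layout is roughly:
 *
 *	/sys/kernel/mm/contiguous/map          (rw, the cma_map rules)
 *	/sys/kernel/mm/contiguous/allocators   (ro, registered allocators)
 *	/sys/kernel/mm/contiguous/regions/<start-address>/
 *		name, start, size, free, users, alloc
 *	/sys/kernel/mm/contiguous/regions/<region-name>  (symlink, if named)
 */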
781
782
783
784struct cma_region_attribute {
785 struct attribute attr;
786 ssize_t (*show)(struct cma_region *reg, char *buf);
787 int (*store)(struct cma_region *reg, const char *buf);
788};
789
790
791static ssize_t cma_sysfs_region_name_show(struct cma_region *reg, char *page)
792{
793 return reg->name ? snprintf(page, PAGE_SIZE, "%s\n", reg->name) : 0;
794}
795
796static ssize_t cma_sysfs_region_start_show(struct cma_region *reg, char *page)
797{
798 return snprintf(page, PAGE_SIZE, "%p\n", (void *)reg->start);
799}
800
801static ssize_t cma_sysfs_region_size_show(struct cma_region *reg, char *page)
802{
803 return snprintf(page, PAGE_SIZE, "%zu\n", reg->size);
804}
805
806static ssize_t cma_sysfs_region_free_show(struct cma_region *reg, char *page)
807{
808 return snprintf(page, PAGE_SIZE, "%zu\n", reg->free_space);
809}
810
811static ssize_t cma_sysfs_region_users_show(struct cma_region *reg, char *page)
812{
813 return snprintf(page, PAGE_SIZE, "%u\n", reg->users);
814}
815
816static ssize_t cma_sysfs_region_alloc_show(struct cma_region *reg, char *page)
817{
818 if (reg->alloc)
819 return snprintf(page, PAGE_SIZE, "%s\n",
820 reg->alloc->name ?: "-");
821 else if (reg->alloc_name)
822 return snprintf(page, PAGE_SIZE, "[%s]\n", reg->alloc_name);
823 else
824 return 0;
825}
826
827static int
828cma_sysfs_region_alloc_store(struct cma_region *reg, const char *page)
829{
830 char *s;
831
832 if (reg->alloc && reg->users)
833 return -EBUSY;
834
835 if (!*page || *page == '\n') {
836 s = NULL;
837 } else {
838 size_t len;
839
840 for (s = (char *)page; *++s && *s != '\n'; )
841 /* nop */;
842
843 len = s - page;
844 s = kmemdup(page, len + 1, GFP_KERNEL);
845 if (!s)
846 return -ENOMEM;
847 s[len] = '\0';
848 }
849
850 if (reg->alloc)
851 __cma_region_detach_alloc(reg);
852
853 if (reg->free_alloc_name)
854 kfree(reg->alloc_name);
855
856 reg->alloc_name = s;
857 reg->free_alloc_name = !!s;
858
859 return 0;
860}
861
862
863static ssize_t
864cma_sysfs_region_show(struct kobject *kobj, struct attribute *attr,
865 char *buf)
866{
867 struct cma_region *reg = container_of(kobj, struct cma_region, kobj);
868 struct cma_region_attribute *rattr =
869 container_of(attr, struct cma_region_attribute, attr);
870 ssize_t ret;
871
872 mutex_lock(&cma_mutex);
873 ret = rattr->show(reg, buf);
874 mutex_unlock(&cma_mutex);
875
876 return ret;
877}
878
879static ssize_t
880cma_sysfs_region_store(struct kobject *kobj, struct attribute *attr,
881 const char *buf, size_t count)
882{
883 struct cma_region *reg = container_of(kobj, struct cma_region, kobj);
884 struct cma_region_attribute *rattr =
885 container_of(attr, struct cma_region_attribute, attr);
886 int ret;
887
888 mutex_lock(&cma_mutex);
889 ret = rattr->store(reg, buf);
890 mutex_unlock(&cma_mutex);
891
892 return ret < 0 ? ret : count;
893}
894
895static struct kobj_type cma_sysfs_region_type = {
896 .sysfs_ops = &(const struct sysfs_ops){
897 .show = cma_sysfs_region_show,
898 .store = cma_sysfs_region_store,
899 },
900 .default_attrs = (struct attribute * []) {
901 CMA_ATTR_RO_INLINE(region, name),
902 CMA_ATTR_RO_INLINE(region, start),
903 CMA_ATTR_RO_INLINE(region, size),
904 CMA_ATTR_RO_INLINE(region, free),
905 CMA_ATTR_RO_INLINE(region, users),
906 CMA_ATTR_INLINE(region, alloc),
907 NULL
908 },
909};
910
911static void __cma_sysfs_region_add(struct cma_region *reg)
912{
913 int ret;
914
915 if (!cma_sysfs_regions_ready)
916 return;
917
918 memset(&reg->kobj, 0, sizeof reg->kobj);
919
920 ret = kobject_init_and_add(&reg->kobj, &cma_sysfs_region_type,
921 &cma_sysfs_regions,
922 "%p", (void *)reg->start);
923
924 if (reg->name &&
925 sysfs_create_link(&cma_sysfs_regions, &reg->kobj, reg->name) < 0)
926 /* Ignore any errors. */;
927}
928
929#else
930
931static void __cma_sysfs_region_add(struct cma_region *reg)
932{
933 /* nop */
934}
935
936#endif
937
938
939/************************* Chunks *************************/
940
941/* All chunks sorted by start address. */
942static struct rb_root cma_chunks_by_start;
943
944static struct cma_chunk *__must_check __cma_chunk_find(dma_addr_t addr)
945{
946 struct cma_chunk *chunk;
947 struct rb_node *n;
948
949 for (n = cma_chunks_by_start.rb_node; n; ) {
950 chunk = rb_entry(n, struct cma_chunk, by_start);
951 if (addr < chunk->start)
952 n = n->rb_left;
953 else if (addr > chunk->start)
954 n = n->rb_right;
955 else
956 return chunk;
957 }
958 WARN(1, "no chunk starting at %p\n", (void *)addr);
959 return NULL;
960}
961
962static int __must_check __cma_chunk_insert(struct cma_chunk *chunk)
963{
964 struct rb_node **new, *parent = NULL;
965 typeof(chunk->start) addr = chunk->start;
966
967 for (new = &cma_chunks_by_start.rb_node; *new; ) {
968 struct cma_chunk *c =
969 container_of(*new, struct cma_chunk, by_start);
970
971 parent = *new;
972 if (addr < c->start) {
973 new = &(*new)->rb_left;
974 } else if (addr > c->start) {
975 new = &(*new)->rb_right;
976 } else {
977 /*
978 * We should never be here. If we are, it
979 * means the allocator gave us an invalid chunk
980 * (one that has already been allocated) so we
981 * refuse to accept it. Our caller will
982 * recover by freeing the chunk.
983 */
984 WARN_ON(1);
985 return -EADDRINUSE;
986 }
987 }
988
989 rb_link_node(&chunk->by_start, parent, new);
990 rb_insert_color(&chunk->by_start, &cma_chunks_by_start);
991
992 return 0;
993}
994
995static void __cma_chunk_free(struct cma_chunk *chunk)
996{
997 rb_erase(&chunk->by_start, &cma_chunks_by_start);
998
999 chunk->reg->free_space += chunk->size;
1000 --chunk->reg->users;
1001
1002 chunk->reg->alloc->free(chunk);
1003}
1004
1005
1006/************************* The Device API *************************/
1007
1008static const char *__must_check
1009__cma_where_from(const struct device *dev, const char *type);
1010
1011
1012/* Allocate. */
1013
1014static dma_addr_t __must_check
1015__cma_alloc_from_region(struct cma_region *reg,
1016 size_t size, dma_addr_t alignment)
1017{
1018 struct cma_chunk *chunk;
1019
1020 pr_debug("allocate %p/%p from %s\n",
1021 (void *)size, (void *)alignment,
1022 reg ? reg->name ?: "(private)" : "(null)");
1023
1024 if (!reg || reg->free_space < size)
1025 return -ENOMEM;
1026
1027 if (!reg->alloc) {
1028 if (!reg->used)
1029 __cma_region_attach_alloc(reg);
1030 if (!reg->alloc)
1031 return -ENOMEM;
1032 }
1033
1034 chunk = reg->alloc->alloc(reg, size, alignment);
1035 if (!chunk)
1036 return -ENOMEM;
1037 /* Set the owner before inserting so the error path frees it correctly. */
1038 chunk->reg = reg;
1039 if (unlikely(__cma_chunk_insert(chunk) < 0)) {
1040 /* We should *never* be here. */
1041 chunk->reg->alloc->free(chunk);
1042 kfree(chunk);
1043 return -EADDRINUSE;
1044 }
1045
1046 ++reg->users;
1047 reg->free_space -= chunk->size;
1048 pr_debug("allocated at %p\n", (void *)chunk->start);
1049 return chunk->start;
1050}
1051
1052dma_addr_t __must_check
1053cma_alloc_from_region(struct cma_region *reg,
1054 size_t size, dma_addr_t alignment)
1055{
1056 dma_addr_t addr;
1057
1058 pr_debug("allocate %p/%p from %s\n",
1059 (void *)size, (void *)alignment,
1060 reg ? reg->name ?: "(private)" : "(null)");
1061
1062 if (!size || alignment & (alignment - 1) || !reg)
1063 return -EINVAL;
1064
1065 mutex_lock(&cma_mutex);
1066
1067 addr = reg->registered ?
1068 __cma_alloc_from_region(reg, PAGE_ALIGN(size),
1069 max(alignment, (dma_addr_t)PAGE_SIZE)) :
1070 -EINVAL;
1071
1072 mutex_unlock(&cma_mutex);
1073
1074 return addr;
1075}
1076EXPORT_SYMBOL_GPL(cma_alloc_from_region);
1077
1078dma_addr_t __must_check
1079__cma_alloc(const struct device *dev, const char *type,
1080 dma_addr_t size, dma_addr_t alignment)
1081{
1082 struct cma_region *reg;
1083 const char *from;
1084 dma_addr_t addr;
1085
1086 if (dev)
1087 pr_debug("allocate %p/%p for %s/%s\n",
1088 (void *)size, (void *)alignment,
1089 dev_name(dev), type ?: "");
1090
1091 if (!size || (alignment & (alignment - 1)))
1092 return -EINVAL;
1093
1094 if (alignment < PAGE_SIZE)
1095 alignment = PAGE_SIZE;
1096
1097 if (!IS_ALIGNED(size, alignment))
1098 size = ALIGN(size, alignment);
1099
1100 mutex_lock(&cma_mutex);
1101
1102 from = __cma_where_from(dev, type);
1103 if (unlikely(IS_ERR(from))) {
1104 addr = PTR_ERR(from);
1105 goto done;
1106 }
1107
1108 pr_debug("allocate %p/%p from one of %s\n",
1109 (void *)size, (void *)alignment, from);
1110
1111 while (*from && *from != ';') {
1112 reg = __cma_region_find(&from);
1113 addr = __cma_alloc_from_region(reg, size, alignment);
1114 if (!IS_ERR_VALUE(addr))
1115 goto done;
1116 }
1117
1118 pr_debug("not enough memory\n");
1119 addr = -ENOMEM;
1120
1121done:
1122 mutex_unlock(&cma_mutex);
1123
1124 return addr;
1125}
1126EXPORT_SYMBOL_GPL(__cma_alloc);
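/*
 * A hedged driver-side sketch (not part of this file): allocate a
 * physically contiguous buffer for a device and release it again.
 * The 1 MiB size and PAGE_SIZE alignment are placeholders.
 */
static int example_use_cma(struct device *dev)
{
	dma_addr_t addr;

	/* A NULL type is treated as the "common" type by __cma_where_from(). */
	addr = __cma_alloc(dev, NULL, 1 << 20, PAGE_SIZE);
	if (IS_ERR_VALUE(addr))
		return (int)addr;

	/* ... hand the block at addr to the hardware ... */

	return cma_free(addr);
}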
1127
1128
1129void *cma_get_virt(dma_addr_t phys, dma_addr_t size, int noncached)
1130{
1131 unsigned long num_pages, i;
1132 struct page **pages;
1133 void *virt;
1134
1135 if (noncached) {
1136 num_pages = size >> PAGE_SHIFT;
1137 pages = kmalloc(num_pages * sizeof(struct page *), GFP_KERNEL);
1138
1139 if (!pages)
1140 return ERR_PTR(-ENOMEM);
1141
1142 for (i = 0; i < num_pages; i++)
1143 pages[i] = pfn_to_page((phys >> PAGE_SHIFT) + i);
1144
1145 virt = vmap(pages, num_pages, VM_MAP,
1146 pgprot_writecombine(PAGE_KERNEL));
1147
1148 if (!virt) {
1149 kfree(pages);
1150 return ERR_PTR(-ENOMEM);
1151 }
1152
1153 kfree(pages);
1154 } else {
1155 virt = phys_to_virt((unsigned long)phys);
1156 }
1157
1158 return virt;
1159}
1160EXPORT_SYMBOL_GPL(cma_get_virt);
1161
1162/* Query information about regions. */
1163static void __cma_info_add(struct cma_info *infop, struct cma_region *reg)
1164{
1165 infop->total_size += reg->size;
1166 infop->free_size += reg->free_space;
1167 if (infop->lower_bound > reg->start)
1168 infop->lower_bound = reg->start;
1169 if (infop->upper_bound < reg->start + reg->size)
1170 infop->upper_bound = reg->start + reg->size;
1171 ++infop->count;
1172}
1173
1174int
1175__cma_info(struct cma_info *infop, const struct device *dev, const char *type)
1176{
1177 struct cma_info info = { ~(dma_addr_t)0, 0, 0, 0, 0 };
1178 struct cma_region *reg;
1179 const char *from;
1180 int ret;
1181
1182 if (unlikely(!infop))
1183 return -EINVAL;
1184
1185 mutex_lock(&cma_mutex);
1186
1187 from = __cma_where_from(dev, type);
1188 if (IS_ERR(from)) {
1189 ret = PTR_ERR(from);
1190 info.lower_bound = 0;
1191 goto done;
1192 }
1193
1194 while (*from && *from != ';') {
1195 reg = __cma_region_find(&from);
1196 if (reg)
1197 __cma_info_add(&info, reg);
1198 }
1199
1200 ret = 0;
1201done:
1202 mutex_unlock(&cma_mutex);
1203
1204 memcpy(infop, &info, sizeof info);
1205 return ret;
1206}
1207EXPORT_SYMBOL_GPL(__cma_info);
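/*
 * Illustrative sketch of __cma_info(): report how much contiguous
 * memory the regions mapped to a device still have.  The casts are
 * only there to keep the example printf-safe.
 */
static void example_report_cma(const struct device *dev)
{
	struct cma_info info;

	if (!__cma_info(&info, dev, NULL))
		pr_info("cma: %u region(s), %lu of %lu bytes free\n",
			(unsigned int)info.count,
			(unsigned long)info.free_size,
			(unsigned long)info.total_size);
}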
1208
1209
1210/* Freeing. */
1211int cma_free(dma_addr_t addr)
1212{
1213 struct cma_chunk *c;
1214 int ret;
1215
1216 mutex_lock(&cma_mutex);
1217
1218 c = __cma_chunk_find(addr);
1219
1220 if (c) {
1221 __cma_chunk_free(c);
1222 ret = 0;
1223 } else {
1224 ret = -ENOENT;
1225 }
1226
1227 mutex_unlock(&cma_mutex);
1228
1229 if (c)
1230 pr_debug("free(%p): freed\n", (void *)addr);
1231 else
1232 pr_err("free(%p): not found\n", (void *)addr);
1233 return ret;
1234}
1235EXPORT_SYMBOL_GPL(cma_free);
1236
1237
1238/************************* Miscellaneous *************************/
1239
1240static int __cma_region_attach_alloc(struct cma_region *reg)
1241{
1242 struct cma_allocator *alloc;
1243 int ret;
1244
1245 /*
1246 * If reg->alloc is set then the caller wants us to use this
1247 * allocator. Otherwise we need to find one by name.
1248 */
1249 if (reg->alloc) {
1250 alloc = reg->alloc;
1251 } else {
1252 alloc = __cma_allocator_find(reg->alloc_name);
1253 if (!alloc) {
1254 pr_warn("init: %s: %s: no such allocator\n",
1255 reg->name ?: "(private)",
1256 reg->alloc_name ?: "(default)");
1257 reg->used = 1;
1258 return -ENOENT;
1259 }
1260 }
1261
1262 /* Try to initialise the allocator. */
1263 reg->private_data = NULL;
1264 ret = alloc->init ? alloc->init(reg) : 0;
1265 if (unlikely(ret < 0)) {
1266 pr_err("init: %s: %s: unable to initialise allocator\n",
1267 reg->name ?: "(private)", alloc->name ?: "(unnamed)");
1268 reg->alloc = NULL;
1269 reg->used = 1;
1270 } else {
1271 reg->alloc = alloc;
1272 pr_debug("init: %s: %s: initialised allocator\n",
1273 reg->name ?: "(private)", alloc->name ?: "(unnamed)");
1274 }
1275 return ret;
1276}
1277
1278static void __cma_region_detach_alloc(struct cma_region *reg)
1279{
1280 if (!reg->alloc)
1281 return;
1282
1283 if (reg->alloc->cleanup)
1284 reg->alloc->cleanup(reg);
1285
1286 reg->alloc = NULL;
1287 reg->used = 1;
1288}
1289
1290
1291/*
1292 * s ::= rules
1293 * rules ::= rule [ ';' rules ]
1294 * rule ::= patterns '=' regions
1295 * patterns ::= pattern [ ',' patterns ]
1296 * regions ::= REG-NAME [ ',' regions ]
1297 * pattern ::= dev-pattern [ '/' TYPE-NAME ] | '/' TYPE-NAME
1298 */
1299static const char *__must_check
1300__cma_where_from(const struct device *dev, const char *type)
1301{
1302 /*
1303 * This function matches the pattern from the map attribute
1304 * against the given device name and type. The type may of
1305 * course be NULL or an empty string.
1306 */
1307
1308 const char *s, *name;
1309 int name_matched = 0;
1310
1311 /*
1312 * If dev is NULL we were called in alternative form where
1313 * type is the from string. All we have to do is return it.
1314 */
1315 if (!dev)
1316 return type ?: ERR_PTR(-EINVAL);
1317
1318 if (!cma_map)
1319 return ERR_PTR(-ENOENT);
1320
1321 name = dev_name(dev);
1322 if (WARN_ON(!name || !*name))
1323 return ERR_PTR(-EINVAL);
1324
1325 if (!type)
1326 type = "common";
1327
1328 /*
1329 * Now we go through the cma_map attribute.
1330 */
1331 for (s = cma_map; *s; ++s) {
1332 const char *c;
1333
1334 /*
1335 * If the pattern starts with a slash, the device part of the
1336 * pattern matches if it matched previously.
1337 */
1338 if (*s == '/') {
1339 if (!name_matched)
1340 goto look_for_next;
1341 goto match_type;
1342 }
1343
1344 /*
1345 * We are now trying to match the device name. This also
1346 * updates the name_matched variable. If, while reading the
1347 * spec, we encounter a comma, it means that the pattern does
1348 * not match and we need to start over with another pattern
1349 * (the one after the comma). If we encounter an equals sign,
1350 * we need to start over with another rule. If there is a
1351 * character that does not match, we need to look for a comma
1352 * (to get another pattern) or a semicolon (to get another
1353 * rule) and try again if there is one somewhere.
1354 */
1355
1356 name_matched = 0;
1357
1358 for (c = name; *s != '*' && *c; ++c, ++s)
1359 if (*s == '=')
1360 goto next_rule;
1361 else if (*s == ',')
1362 goto next_pattern;
1363 else if (*s != '?' && *c != *s)
1364 goto look_for_next;
1365 if (*s == '*')
1366 ++s;
1367
1368 name_matched = 1;
1369
1370 /*
1371 * Now we need to match the type part of the pattern. If the
1372 * pattern does not specify one, we match only if type points
1373 * to an empty string. Otherwise we try to match it like the name.
1374 */
1375 if (*s == '/') {
1376match_type: /* s points to '/' */
1377 ++s;
1378
1379 for (c = type; *s && *c; ++c, ++s)
1380 if (*s == '=')
1381 goto next_rule;
1382 else if (*s == ',')
1383 goto next_pattern;
1384 else if (*c != *s)
1385 goto look_for_next;
1386 }
1387
1388 /* Return the string behind the '=' sign of the rule. */
1389 if (*s == '=')
1390 return s + 1;
1391 else if (*s == ',')
1392 return strchr(s, '=') + 1;
1393
1394 /* Pattern did not match */
1395
1396look_for_next:
1397 do {
1398 ++s;
1399 } while (*s != ',' && *s != '=');
1400 if (*s == ',')
1401 continue;
1402
1403next_rule: /* s points to '=' */
1404 s = strchr(s, ';');
1405 if (!s)
1406 break;
1407
1408next_pattern:
1409 continue;
1410 }
1411
1412 return ERR_PTR(-ENOENT);
1413}
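/*
 * Worked example for the map grammar above (hypothetical device and
 * region names, traced against the matcher as implemented).  Only the
 * device-name part of a pattern understands the '*' and '?' wildcards.
 *
 *	cma_map = "camera,jpeg/raw=cam;video=vid,common;*=common"
 *
 *	device "camera" (any type)	-> try region "cam"
 *	device "jpeg", type "raw"	-> try region "cam"
 *	device "video" (any type)	-> try "vid", then "common"
 *	any other device		-> try region "common"
 */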
diff --git a/mm/compaction.c b/mm/compaction.c
index 6cc604bd564..8ea7308601b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,10 +35,6 @@ struct compact_control {
35 unsigned long migrate_pfn; /* isolate_migratepages search base */ 35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */ 36 bool sync; /* Synchronous migration */
37 37
38 /* Account for isolated anon and file pages */
39 unsigned long nr_anon;
40 unsigned long nr_file;
41
42 unsigned int order; /* order a direct compactor needs */ 38 unsigned int order; /* order a direct compactor needs */
43 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 39 int migratetype; /* MOVABLE, RECLAIMABLE etc */
44 struct zone *zone; 40 struct zone *zone;
@@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone,
223static void acct_isolated(struct zone *zone, struct compact_control *cc) 219static void acct_isolated(struct zone *zone, struct compact_control *cc)
224{ 220{
225 struct page *page; 221 struct page *page;
226 unsigned int count[NR_LRU_LISTS] = { 0, }; 222 unsigned int count[2] = { 0, };
227 223
228 list_for_each_entry(page, &cc->migratepages, lru) { 224 list_for_each_entry(page, &cc->migratepages, lru)
229 int lru = page_lru_base_type(page); 225 count[!!page_is_file_cache(page)]++;
230 count[lru]++;
231 }
232 226
233 cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 227 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
234 cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 228 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
235 __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
236 __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
237} 229}
238 230
239/* Similar to reclaim, but different enough that they don't share logic */ 231/* Similar to reclaim, but different enough that they don't share logic */
@@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
269 unsigned long last_pageblock_nr = 0, pageblock_nr; 261 unsigned long last_pageblock_nr = 0, pageblock_nr;
270 unsigned long nr_scanned = 0, nr_isolated = 0; 262 unsigned long nr_scanned = 0, nr_isolated = 0;
271 struct list_head *migratelist = &cc->migratepages; 263 struct list_head *migratelist = &cc->migratepages;
264 isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
272 265
273 /* Do not scan outside zone boundaries */ 266 /* Do not scan outside zone boundaries */
274 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); 267 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
@@ -320,12 +313,34 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
320 } else if (!locked) 313 } else if (!locked)
321 spin_lock_irq(&zone->lru_lock); 314 spin_lock_irq(&zone->lru_lock);
322 315
316 /*
317 * migrate_pfn does not necessarily start aligned to a
318 * pageblock. Ensure that pfn_valid is called when moving
319 * into a new MAX_ORDER_NR_PAGES range in case of large
320 * memory holes within the zone
321 */
322 if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
323 if (!pfn_valid(low_pfn)) {
324 low_pfn += MAX_ORDER_NR_PAGES - 1;
325 continue;
326 }
327 }
328
323 if (!pfn_valid_within(low_pfn)) 329 if (!pfn_valid_within(low_pfn))
324 continue; 330 continue;
325 nr_scanned++; 331 nr_scanned++;
326 332
327 /* Get the page and skip if free */ 333 /*
334 * Get the page and ensure the page is within the same zone.
335 * See the comment in isolate_freepages about overlapping
336 * nodes. It is deliberate that the new zone lock is not taken
337 * as memory compaction should not move pages between nodes.
338 */
328 page = pfn_to_page(low_pfn); 339 page = pfn_to_page(low_pfn);
340 if (page_zone(page) != zone)
341 continue;
342
343 /* Skip if free */
329 if (PageBuddy(page)) 344 if (PageBuddy(page))
330 continue; 345 continue;
331 346
@@ -356,8 +371,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
356 continue; 371 continue;
357 } 372 }
358 373
374 if (!cc->sync)
375 mode |= ISOLATE_ASYNC_MIGRATE;
376
359 /* Try isolate the page */ 377 /* Try isolate the page */
360 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) 378 if (__isolate_lru_page(page, mode, 0) != 0)
361 continue; 379 continue;
362 380
363 VM_BUG_ON(PageTransCompound(page)); 381 VM_BUG_ON(PageTransCompound(page));
@@ -559,7 +577,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
559 nr_migrate = cc->nr_migratepages; 577 nr_migrate = cc->nr_migratepages;
560 err = migrate_pages(&cc->migratepages, compaction_alloc, 578 err = migrate_pages(&cc->migratepages, compaction_alloc,
561 (unsigned long)cc, false, 579 (unsigned long)cc, false,
562 cc->sync); 580 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
563 update_nr_listpages(cc); 581 update_nr_listpages(cc);
564 nr_remaining = cc->nr_migratepages; 582 nr_remaining = cc->nr_migratepages;
565 583
@@ -574,8 +592,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
574 if (err) { 592 if (err) {
575 putback_lru_pages(&cc->migratepages); 593 putback_lru_pages(&cc->migratepages);
576 cc->nr_migratepages = 0; 594 cc->nr_migratepages = 0;
595 if (err == -ENOMEM) {
596 ret = COMPACT_PARTIAL;
597 goto out;
598 }
577 } 599 }
578
579 } 600 }
580 601
581out: 602out:
diff --git a/mm/filemap.c b/mm/filemap.c
index a8251a8d345..10481ebd96c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -396,24 +396,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range);
396int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) 396int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
397{ 397{
398 int error; 398 int error;
399 struct mem_cgroup *memcg = NULL;
400 399
401 VM_BUG_ON(!PageLocked(old)); 400 VM_BUG_ON(!PageLocked(old));
402 VM_BUG_ON(!PageLocked(new)); 401 VM_BUG_ON(!PageLocked(new));
403 VM_BUG_ON(new->mapping); 402 VM_BUG_ON(new->mapping);
404 403
405 /*
406 * This is not page migration, but prepare_migration and
407 * end_migration does enough work for charge replacement.
408 *
409 * In the longer term we probably want a specialized function
410 * for moving the charge from old to new in a more efficient
411 * manner.
412 */
413 error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
414 if (error)
415 return error;
416
417 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 404 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
418 if (!error) { 405 if (!error) {
419 struct address_space *mapping = old->mapping; 406 struct address_space *mapping = old->mapping;
@@ -435,13 +422,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
435 if (PageSwapBacked(new)) 422 if (PageSwapBacked(new))
436 __inc_zone_page_state(new, NR_SHMEM); 423 __inc_zone_page_state(new, NR_SHMEM);
437 spin_unlock_irq(&mapping->tree_lock); 424 spin_unlock_irq(&mapping->tree_lock);
425 /* mem_cgroup codes must not be called under tree_lock */
426 mem_cgroup_replace_page_cache(old, new);
438 radix_tree_preload_end(); 427 radix_tree_preload_end();
439 if (freepage) 428 if (freepage)
440 freepage(old); 429 freepage(old);
441 page_cache_release(old); 430 page_cache_release(old);
442 mem_cgroup_end_migration(memcg, old, new, true);
443 } else {
444 mem_cgroup_end_migration(memcg, old, new, false);
445 } 431 }
446 432
447 return error; 433 return error;
@@ -530,10 +516,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
530 struct page *page; 516 struct page *page;
531 517
532 if (cpuset_do_page_mem_spread()) { 518 if (cpuset_do_page_mem_spread()) {
533 get_mems_allowed(); 519 unsigned int cpuset_mems_cookie;
534 n = cpuset_mem_spread_node(); 520 do {
535 page = alloc_pages_exact_node(n, gfp, 0); 521 cpuset_mems_cookie = get_mems_allowed();
536 put_mems_allowed(); 522 n = cpuset_mem_spread_node();
523 page = alloc_pages_exact_node(n, gfp, 0);
524 } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
525
537 return page; 526 return page;
538 } 527 }
539 return alloc_pages(gfp, 0); 528 return alloc_pages(gfp, 0);
@@ -1393,15 +1382,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1393 unsigned long seg = 0; 1382 unsigned long seg = 0;
1394 size_t count; 1383 size_t count;
1395 loff_t *ppos = &iocb->ki_pos; 1384 loff_t *ppos = &iocb->ki_pos;
1396 struct blk_plug plug;
1397 1385
1398 count = 0; 1386 count = 0;
1399 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); 1387 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1400 if (retval) 1388 if (retval)
1401 return retval; 1389 return retval;
1402 1390
1403 blk_start_plug(&plug);
1404
1405 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 1391 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1406 if (filp->f_flags & O_DIRECT) { 1392 if (filp->f_flags & O_DIRECT) {
1407 loff_t size; 1393 loff_t size;
@@ -1417,8 +1403,12 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1417 retval = filemap_write_and_wait_range(mapping, pos, 1403 retval = filemap_write_and_wait_range(mapping, pos,
1418 pos + iov_length(iov, nr_segs) - 1); 1404 pos + iov_length(iov, nr_segs) - 1);
1419 if (!retval) { 1405 if (!retval) {
1406 struct blk_plug plug;
1407
1408 blk_start_plug(&plug);
1420 retval = mapping->a_ops->direct_IO(READ, iocb, 1409 retval = mapping->a_ops->direct_IO(READ, iocb,
1421 iov, pos, nr_segs); 1410 iov, pos, nr_segs);
1411 blk_finish_plug(&plug);
1422 } 1412 }
1423 if (retval > 0) { 1413 if (retval > 0) {
1424 *ppos = pos + retval; 1414 *ppos = pos + retval;
@@ -1474,7 +1464,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1474 break; 1464 break;
1475 } 1465 }
1476out: 1466out:
1477 blk_finish_plug(&plug);
1478 return retval; 1467 return retval;
1479} 1468}
1480EXPORT_SYMBOL(generic_file_aio_read); 1469EXPORT_SYMBOL(generic_file_aio_read);
@@ -1807,7 +1796,7 @@ repeat:
1807 page = __page_cache_alloc(gfp | __GFP_COLD); 1796 page = __page_cache_alloc(gfp | __GFP_COLD);
1808 if (!page) 1797 if (!page)
1809 return ERR_PTR(-ENOMEM); 1798 return ERR_PTR(-ENOMEM);
1810 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); 1799 err = add_to_page_cache_lru(page, mapping, index, gfp);
1811 if (unlikely(err)) { 1800 if (unlikely(err)) {
1812 page_cache_release(page); 1801 page_cache_release(page);
1813 if (err == -EEXIST) 1802 if (err == -EEXIST)
@@ -1904,10 +1893,7 @@ static struct page *wait_on_page_read(struct page *page)
1904 * @gfp: the page allocator flags to use if allocating 1893 * @gfp: the page allocator flags to use if allocating
1905 * 1894 *
1906 * This is the same as "read_mapping_page(mapping, index, NULL)", but with 1895 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
1907 * any new page allocations done using the specified allocation flags. Note 1896 * any new page allocations done using the specified allocation flags.
1908 * that the Radix tree operations will still use GFP_KERNEL, so you can't
1909 * expect to do this atomically or anything like that - but you can pass in
1910 * other page requirements.
1911 * 1897 *
1912 * If the page does not get brought uptodate, return -EIO. 1898 * If the page does not get brought uptodate, return -EIO.
1913 */ 1899 */
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 93356cd1282..dee94297f39 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -263,7 +263,12 @@ found:
263 xip_pfn); 263 xip_pfn);
264 if (err == -ENOMEM) 264 if (err == -ENOMEM)
265 return VM_FAULT_OOM; 265 return VM_FAULT_OOM;
266 BUG_ON(err); 266 /*
267 * err == -EBUSY is fine, we've raced against another thread
268 * that faulted-in the same page
269 */
270 if (err != -EBUSY)
271 BUG_ON(err);
267 return VM_FAULT_NOPAGE; 272 return VM_FAULT_NOPAGE;
268 } else { 273 } else {
269 int err, ret = VM_FAULT_OOM; 274 int err, ret = VM_FAULT_OOM;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 81532f297fd..8cc11dda6a7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -641,6 +641,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
641 set_pmd_at(mm, haddr, pmd, entry); 641 set_pmd_at(mm, haddr, pmd, entry);
642 prepare_pmd_huge_pte(pgtable, mm); 642 prepare_pmd_huge_pte(pgtable, mm);
643 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 643 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
644 mm->nr_ptes++;
644 spin_unlock(&mm->page_table_lock); 645 spin_unlock(&mm->page_table_lock);
645 } 646 }
646 647
@@ -759,6 +760,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
759 pmd = pmd_mkold(pmd_wrprotect(pmd)); 760 pmd = pmd_mkold(pmd_wrprotect(pmd));
760 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 761 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
761 prepare_pmd_huge_pte(pgtable, dst_mm); 762 prepare_pmd_huge_pte(pgtable, dst_mm);
763 dst_mm->nr_ptes++;
762 764
763 ret = 0; 765 ret = 0;
764out_unlock: 766out_unlock:
@@ -857,7 +859,6 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
857 } 859 }
858 kfree(pages); 860 kfree(pages);
859 861
860 mm->nr_ptes++;
861 smp_wmb(); /* make pte visible before pmd */ 862 smp_wmb(); /* make pte visible before pmd */
862 pmd_populate(mm, pmd, pgtable); 863 pmd_populate(mm, pmd, pgtable);
863 page_remove_rmap(page); 864 page_remove_rmap(page);
@@ -989,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
989 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 990 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
990 VM_BUG_ON(!PageCompound(page)); 991 VM_BUG_ON(!PageCompound(page));
991 if (flags & FOLL_GET) 992 if (flags & FOLL_GET)
992 get_page(page); 993 get_page_foll(page);
993 994
994out: 995out:
995 return page; 996 return page;
@@ -1016,6 +1017,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1016 VM_BUG_ON(page_mapcount(page) < 0); 1017 VM_BUG_ON(page_mapcount(page) < 0);
1017 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1018 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1018 VM_BUG_ON(!PageHead(page)); 1019 VM_BUG_ON(!PageHead(page));
1020 tlb->mm->nr_ptes--;
1019 spin_unlock(&tlb->mm->page_table_lock); 1021 spin_unlock(&tlb->mm->page_table_lock);
1020 tlb_remove_page(tlb, page); 1022 tlb_remove_page(tlb, page);
1021 pte_free(tlb->mm, pgtable); 1023 pte_free(tlb->mm, pgtable);
@@ -1156,6 +1158,7 @@ static void __split_huge_page_refcount(struct page *page)
1156 unsigned long head_index = page->index; 1158 unsigned long head_index = page->index;
1157 struct zone *zone = page_zone(page); 1159 struct zone *zone = page_zone(page);
1158 int zonestat; 1160 int zonestat;
1161 int tail_count = 0;
1159 1162
1160 /* prevent PageLRU to go away from under us, and freeze lru stats */ 1163 /* prevent PageLRU to go away from under us, and freeze lru stats */
1161 spin_lock_irq(&zone->lru_lock); 1164 spin_lock_irq(&zone->lru_lock);
@@ -1164,11 +1167,27 @@ static void __split_huge_page_refcount(struct page *page)
1164 for (i = 1; i < HPAGE_PMD_NR; i++) { 1167 for (i = 1; i < HPAGE_PMD_NR; i++) {
1165 struct page *page_tail = page + i; 1168 struct page *page_tail = page + i;
1166 1169
1167 /* tail_page->_count cannot change */ 1170 /* tail_page->_mapcount cannot change */
1168 atomic_sub(atomic_read(&page_tail->_count), &page->_count); 1171 BUG_ON(page_mapcount(page_tail) < 0);
1169 BUG_ON(page_count(page) <= 0); 1172 tail_count += page_mapcount(page_tail);
1170 atomic_add(page_mapcount(page) + 1, &page_tail->_count); 1173 /* check for overflow */
1171 BUG_ON(atomic_read(&page_tail->_count) <= 0); 1174 BUG_ON(tail_count < 0);
1175 BUG_ON(atomic_read(&page_tail->_count) != 0);
1176 /*
1177 * tail_page->_count is zero and not changing from
1178 * under us. But get_page_unless_zero() may be running
1179 * from under us on the tail_page. If we used
1180 * atomic_set() below instead of atomic_add(), we
1181 * would then run atomic_set() concurrently with
1182 * get_page_unless_zero(), and atomic_set() is
1183 * implemented in C not using locked ops. spin_unlock
1184 * on x86 sometimes uses locked ops because of PPro
1185 * errata 66, 92, so unless somebody can guarantee
1186 * atomic_set() here would be safe on all archs (and
1187 * not only on x86), it's safer to use atomic_add().
1188 */
1189 atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
1190 &page_tail->_count);
1172 1191
1173 /* after clearing PageTail the gup refcount can be released */ 1192 /* after clearing PageTail the gup refcount can be released */
1174 smp_mb(); 1193 smp_mb();
@@ -1186,10 +1205,7 @@ static void __split_huge_page_refcount(struct page *page)
1186 (1L << PG_uptodate))); 1205 (1L << PG_uptodate)));
1187 page_tail->flags |= (1L << PG_dirty); 1206 page_tail->flags |= (1L << PG_dirty);
1188 1207
1189 /* 1208 /* clear PageTail before overwriting first_page */
1190 * 1) clear PageTail before overwriting first_page
1191 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
1192 */
1193 smp_wmb(); 1209 smp_wmb();
1194 1210
1195 /* 1211 /*
@@ -1206,7 +1222,6 @@ static void __split_huge_page_refcount(struct page *page)
1206 * status is achieved setting a reserved bit in the 1222 * status is achieved setting a reserved bit in the
1207 * pmd, not by clearing the present bit. 1223 * pmd, not by clearing the present bit.
1208 */ 1224 */
1209 BUG_ON(page_mapcount(page_tail));
1210 page_tail->_mapcount = page->_mapcount; 1225 page_tail->_mapcount = page->_mapcount;
1211 1226
1212 BUG_ON(page_tail->mapping); 1227 BUG_ON(page_tail->mapping);
@@ -1223,6 +1238,8 @@ static void __split_huge_page_refcount(struct page *page)
1223 1238
1224 lru_add_page_tail(zone, page, page_tail); 1239 lru_add_page_tail(zone, page, page_tail);
1225 } 1240 }
1241 atomic_sub(tail_count, &page->_count);
1242 BUG_ON(atomic_read(&page->_count) <= 0);
1226 1243
1227 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1244 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1228 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); 1245 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
@@ -1295,7 +1312,6 @@ static int __split_huge_page_map(struct page *page,
1295 pte_unmap(pte); 1312 pte_unmap(pte);
1296 } 1313 }
1297 1314
1298 mm->nr_ptes++;
1299 smp_wmb(); /* make pte visible before pmd */ 1315 smp_wmb(); /* make pte visible before pmd */
1300 /* 1316 /*
1301 * Up to this point the pmd is present and huge and 1317 * Up to this point the pmd is present and huge and
@@ -1910,7 +1926,6 @@ static void collapse_huge_page(struct mm_struct *mm,
1910 set_pmd_at(mm, address, pmd, _pmd); 1926 set_pmd_at(mm, address, pmd, _pmd);
1911 update_mmu_cache(vma, address, entry); 1927 update_mmu_cache(vma, address, entry);
1912 prepare_pmd_huge_pte(pgtable, mm); 1928 prepare_pmd_huge_pte(pgtable, mm);
1913 mm->nr_ptes--;
1914 spin_unlock(&mm->page_table_lock); 1929 spin_unlock(&mm->page_table_lock);
1915 1930
1916#ifndef CONFIG_NUMA 1931#ifndef CONFIG_NUMA
@@ -2005,7 +2020,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
2005{ 2020{
2006 struct mm_struct *mm = mm_slot->mm; 2021 struct mm_struct *mm = mm_slot->mm;
2007 2022
2008 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); 2023 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2009 2024
2010 if (khugepaged_test_exit(mm)) { 2025 if (khugepaged_test_exit(mm)) {
2011 /* free mm_slot */ 2026 /* free mm_slot */
@@ -2033,7 +2048,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2033 int progress = 0; 2048 int progress = 0;
2034 2049
2035 VM_BUG_ON(!pages); 2050 VM_BUG_ON(!pages);
2036 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); 2051 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2037 2052
2038 if (khugepaged_scan.mm_slot) 2053 if (khugepaged_scan.mm_slot)
2039 mm_slot = khugepaged_scan.mm_slot; 2054 mm_slot = khugepaged_scan.mm_slot;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bfcf153bc82..037f077b986 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -460,8 +460,10 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
460 struct zonelist *zonelist; 460 struct zonelist *zonelist;
461 struct zone *zone; 461 struct zone *zone;
462 struct zoneref *z; 462 struct zoneref *z;
463 unsigned int cpuset_mems_cookie;
463 464
464 get_mems_allowed(); 465retry_cpuset:
466 cpuset_mems_cookie = get_mems_allowed();
465 zonelist = huge_zonelist(vma, address, 467 zonelist = huge_zonelist(vma, address,
466 htlb_alloc_mask, &mpol, &nodemask); 468 htlb_alloc_mask, &mpol, &nodemask);
467 /* 469 /*
@@ -488,10 +490,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
488 } 490 }
489 } 491 }
490 } 492 }
491err: 493
492 mpol_cond_put(mpol); 494 mpol_cond_put(mpol);
493 put_mems_allowed(); 495 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
496 goto retry_cpuset;
494 return page; 497 return page;
498
499err:
500 mpol_cond_put(mpol);
501 return NULL;
495} 502}
496 503
497static void update_and_free_page(struct hstate *h, struct page *page) 504static void update_and_free_page(struct hstate *h, struct page *page)
@@ -575,6 +582,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
575 __SetPageHead(page); 582 __SetPageHead(page);
576 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 583 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
577 __SetPageTail(p); 584 __SetPageTail(p);
585 set_page_count(p, 0);
578 p->first_page = page; 586 p->first_page = page;
579 } 587 }
580} 588}
@@ -900,7 +908,6 @@ retry:
900 h->resv_huge_pages += delta; 908 h->resv_huge_pages += delta;
901 ret = 0; 909 ret = 0;
902 910
903 spin_unlock(&hugetlb_lock);
904 /* Free the needed pages to the hugetlb pool */ 911 /* Free the needed pages to the hugetlb pool */
905 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 912 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
906 if ((--needed) < 0) 913 if ((--needed) < 0)
@@ -914,6 +921,7 @@ retry:
914 VM_BUG_ON(page_count(page)); 921 VM_BUG_ON(page_count(page));
915 enqueue_huge_page(h, page); 922 enqueue_huge_page(h, page);
916 } 923 }
924 spin_unlock(&hugetlb_lock);
917 925
918 /* Free unnecessary surplus pages to the buddy allocator */ 926 /* Free unnecessary surplus pages to the buddy allocator */
919free: 927free:
@@ -2059,6 +2067,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2059 kref_get(&reservations->refs); 2067 kref_get(&reservations->refs);
2060} 2068}
2061 2069
2070static void resv_map_put(struct vm_area_struct *vma)
2071{
2072 struct resv_map *reservations = vma_resv_map(vma);
2073
2074 if (!reservations)
2075 return;
2076 kref_put(&reservations->refs, resv_map_release);
2077}
2078
2062static void hugetlb_vm_op_close(struct vm_area_struct *vma) 2079static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2063{ 2080{
2064 struct hstate *h = hstate_vma(vma); 2081 struct hstate *h = hstate_vma(vma);
@@ -2074,7 +2091,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2074 reserve = (end - start) - 2091 reserve = (end - start) -
2075 region_count(&reservations->regions, start, end); 2092 region_count(&reservations->regions, start, end);
2076 2093
2077 kref_put(&reservations->refs, resv_map_release); 2094 resv_map_put(vma);
2078 2095
2079 if (reserve) { 2096 if (reserve) {
2080 hugetlb_acct_memory(h, -reserve); 2097 hugetlb_acct_memory(h, -reserve);
@@ -2284,6 +2301,22 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2284{ 2301{
2285 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 2302 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
2286 __unmap_hugepage_range(vma, start, end, ref_page); 2303 __unmap_hugepage_range(vma, start, end, ref_page);
2304 /*
2305 * Clear this flag so that x86's huge_pmd_share page_table_shareable
2306 * test will fail on a vma being torn down, and not grab a page table
2307 * on its way out. We're lucky that the flag has such an appropriate
2308 * name, and can in fact be safely cleared here. We could clear it
2309 * before the __unmap_hugepage_range above, but all that's necessary
2310 * is to clear it before releasing the i_mmap_mutex below.
2311 *
2312 * This works because in the contexts this is called, the VMA is
2313 * going to be destroyed. It is not vulnerable to madvise(DONTNEED)
2314 * because madvise is not supported on hugetlbfs. The same applies
2315 * for direct IO. unmap_hugepage_range() is only being called just
2316 * before free_pgtables() so clearing VM_MAYSHARE will not cause
2317 * surprises later.
2318 */
2319 vma->vm_flags &= ~VM_MAYSHARE;
2287 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 2320 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2288} 2321}
2289 2322
@@ -2397,7 +2430,6 @@ retry_avoidcopy:
2397 if (outside_reserve) { 2430 if (outside_reserve) {
2398 BUG_ON(huge_pte_none(pte)); 2431 BUG_ON(huge_pte_none(pte));
2399 if (unmap_ref_private(mm, vma, old_page, address)) { 2432 if (unmap_ref_private(mm, vma, old_page, address)) {
2400 BUG_ON(page_count(old_page) != 1);
2401 BUG_ON(huge_pte_none(pte)); 2433 BUG_ON(huge_pte_none(pte));
2402 spin_lock(&mm->page_table_lock); 2434 spin_lock(&mm->page_table_lock);
2403 goto retry_avoidcopy; 2435 goto retry_avoidcopy;
@@ -2415,6 +2447,8 @@ retry_avoidcopy:
2415 * anon_vma prepared. 2447 * anon_vma prepared.
2416 */ 2448 */
2417 if (unlikely(anon_vma_prepare(vma))) { 2449 if (unlikely(anon_vma_prepare(vma))) {
2450 page_cache_release(new_page);
2451 page_cache_release(old_page);
2418 /* Caller expects lock to be held */ 2452 /* Caller expects lock to be held */
2419 spin_lock(&mm->page_table_lock); 2453 spin_lock(&mm->page_table_lock);
2420 return VM_FAULT_OOM; 2454 return VM_FAULT_OOM;
@@ -2676,6 +2710,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2676 * so no worry about deadlock. 2710 * so no worry about deadlock.
2677 */ 2711 */
2678 page = pte_page(entry); 2712 page = pte_page(entry);
2713 get_page(page);
2679 if (page != pagecache_page) 2714 if (page != pagecache_page)
2680 lock_page(page); 2715 lock_page(page);
2681 2716
@@ -2707,6 +2742,7 @@ out_page_table_lock:
2707 } 2742 }
2708 if (page != pagecache_page) 2743 if (page != pagecache_page)
2709 unlock_page(page); 2744 unlock_page(page);
2745 put_page(page);
2710 2746
2711out_mutex: 2747out_mutex:
2712 mutex_unlock(&hugetlb_instantiation_mutex); 2748 mutex_unlock(&hugetlb_instantiation_mutex);
@@ -2833,9 +2869,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2833 } 2869 }
2834 } 2870 }
2835 spin_unlock(&mm->page_table_lock); 2871 spin_unlock(&mm->page_table_lock);
2836 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 2872 /*
2837 2873 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
2874 * may have cleared our pud entry and done put_page on the page table:
2875 * once we release i_mmap_mutex, another task can do the final put_page
2876 * and that page table be reused and filled with junk.
2877 */
2838 flush_tlb_range(vma, start, end); 2878 flush_tlb_range(vma, start, end);
2879 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2839} 2880}
2840 2881
2841int hugetlb_reserve_pages(struct inode *inode, 2882int hugetlb_reserve_pages(struct inode *inode,
@@ -2873,12 +2914,16 @@ int hugetlb_reserve_pages(struct inode *inode,
2873 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 2914 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
2874 } 2915 }
2875 2916
2876 if (chg < 0) 2917 if (chg < 0) {
2877 return chg; 2918 ret = chg;
2919 goto out_err;
2920 }
2878 2921
2879 /* There must be enough filesystem quota for the mapping */ 2922 /* There must be enough filesystem quota for the mapping */
2880 if (hugetlb_get_quota(inode->i_mapping, chg)) 2923 if (hugetlb_get_quota(inode->i_mapping, chg)) {
2881 return -ENOSPC; 2924 ret = -ENOSPC;
2925 goto out_err;
2926 }
2882 2927
2883 /* 2928 /*
2884 * Check enough hugepages are available for the reservation. 2929 * Check enough hugepages are available for the reservation.
@@ -2887,7 +2932,7 @@ int hugetlb_reserve_pages(struct inode *inode,
2887 ret = hugetlb_acct_memory(h, chg); 2932 ret = hugetlb_acct_memory(h, chg);
2888 if (ret < 0) { 2933 if (ret < 0) {
2889 hugetlb_put_quota(inode->i_mapping, chg); 2934 hugetlb_put_quota(inode->i_mapping, chg);
2890 return ret; 2935 goto out_err;
2891 } 2936 }
2892 2937
2893 /* 2938 /*
@@ -2904,6 +2949,10 @@ int hugetlb_reserve_pages(struct inode *inode,
2904 if (!vma || vma->vm_flags & VM_MAYSHARE) 2949 if (!vma || vma->vm_flags & VM_MAYSHARE)
2905 region_add(&inode->i_mapping->private_list, from, to); 2950 region_add(&inode->i_mapping->private_list, from, to);
2906 return 0; 2951 return 0;
2952out_err:
2953 if (vma)
2954 resv_map_put(vma);
2955 return ret;
2907} 2956}
2908 2957
2909void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 2958void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
diff --git a/mm/internal.h b/mm/internal.h
index d071d380fb4..2189af49178 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page)
37 atomic_dec(&page->_count); 37 atomic_dec(&page->_count);
38} 38}
39 39
40static inline void __get_page_tail_foll(struct page *page,
41 bool get_page_head)
42{
43 /*
44 * If we're getting a tail page, the elevated page->_count is
45 * required only in the head page and we will elevate the head
46 * page->_count and tail page->_mapcount.
47 *
48 * We elevate page_tail->_mapcount for tail pages to force
49 * page_tail->_count to be zero at all times to avoid getting
50 * false positives from get_page_unless_zero() with
51 * speculative page access (like in
52 * page_cache_get_speculative()) on tail pages.
53 */
54 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
55 VM_BUG_ON(atomic_read(&page->_count) != 0);
56 VM_BUG_ON(page_mapcount(page) < 0);
57 if (get_page_head)
58 atomic_inc(&page->first_page->_count);
59 atomic_inc(&page->_mapcount);
60}
61
62/*
63 * This is meant to be called as the FOLL_GET operation of
64 * follow_page() and it must be called while holding the proper PT
65 * lock while the pte (or pmd_trans_huge) is still mapping the page.
66 */
67static inline void get_page_foll(struct page *page)
68{
69 if (unlikely(PageTail(page)))
70 /*
71 * This is safe only because
72 * __split_huge_page_refcount() can't run under
73 * get_page_foll() because we hold the proper PT lock.
74 */
75 __get_page_tail_foll(page, true);
76 else {
77 /*
78 * Getting a normal page or the head of a compound page
79 * requires to already have an elevated page->_count.
80 */
81 VM_BUG_ON(atomic_read(&page->_count) <= 0);
82 atomic_inc(&page->_count);
83 }
84}
85
40extern unsigned long highest_memmap_pfn; 86extern unsigned long highest_memmap_pfn;
41 87
42/* 88/*
diff --git a/mm/ksm.c b/mm/ksm.c
index 9a68b0cf0a1..bf0d59a2c7b 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -184,15 +184,15 @@ static unsigned long ksm_pages_unshared;
184static unsigned long ksm_rmap_items; 184static unsigned long ksm_rmap_items;
185 185
186/* Number of pages ksmd should scan in one batch */ 186/* Number of pages ksmd should scan in one batch */
187static unsigned int ksm_thread_pages_to_scan = 100; 187static unsigned int ksm_thread_pages_to_scan = 128;
188 188
189/* Milliseconds ksmd should sleep between batches */ 189/* Milliseconds ksmd should sleep between batches */
190static unsigned int ksm_thread_sleep_millisecs = 20; 190static unsigned int ksm_thread_sleep_millisecs = 4000;
191 191
192#define KSM_RUN_STOP 0 192#define KSM_RUN_STOP 0
193#define KSM_RUN_MERGE 1 193#define KSM_RUN_MERGE 1
194#define KSM_RUN_UNMERGE 2 194#define KSM_RUN_UNMERGE 2
195static unsigned int ksm_run = KSM_RUN_STOP; 195static unsigned int ksm_run = KSM_RUN_MERGE;
196 196
197static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); 197static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
198static DEFINE_MUTEX(ksm_thread_mutex); 198static DEFINE_MUTEX(ksm_thread_mutex);
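[Editorial note, not part of the patch] The hunk above only changes ksmd's default tuning: a larger scan batch, a much longer sleep between batches, and starting in KSM_RUN_MERGE instead of stopped. The mechanism is unchanged, and the same knobs remain writable through /sys/kernel/mm/ksm/. As a reminder of how memory reaches ksmd at all, an illustrative userspace sketch: only ranges marked with madvise(MADV_MERGEABLE) are ever scanned.

#define _GNU_SOURCE		/* MADV_MERGEABLE via <sys/mman.h> */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0x5a, len);	/* identical page contents: merge candidates */

	/* ksmd only scans areas explicitly marked MADV_MERGEABLE. */
	if (madvise(buf, len, MADV_MERGEABLE) != 0)
		printf("MADV_MERGEABLE unavailable (CONFIG_KSM off?): %s\n",
		       strerror(errno));
	else
		printf("range registered; merging is governed by\n"
		       "/sys/kernel/mm/ksm/{run,pages_to_scan,sleep_millisecs}\n");

	munmap(buf, len);
	return 0;
}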
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed50..deabe5f603a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -13,6 +13,7 @@
13#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/ksm.h> 15#include <linux/ksm.h>
16#include <linux/file.h>
16 17
17/* 18/*
18 * Any behaviour which results in changes to the vma->vm_flags needs to 19 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -197,14 +198,16 @@ static long madvise_remove(struct vm_area_struct *vma,
197 struct address_space *mapping; 198 struct address_space *mapping;
198 loff_t offset, endoff; 199 loff_t offset, endoff;
199 int error; 200 int error;
201 struct file *f;
200 202
201 *prev = NULL; /* tell sys_madvise we drop mmap_sem */ 203 *prev = NULL; /* tell sys_madvise we drop mmap_sem */
202 204
203 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) 205 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
204 return -EINVAL; 206 return -EINVAL;
205 207
206 if (!vma->vm_file || !vma->vm_file->f_mapping 208 f = vma->vm_file;
207 || !vma->vm_file->f_mapping->host) { 209
210 if (!f || !f->f_mapping || !f->f_mapping->host) {
208 return -EINVAL; 211 return -EINVAL;
209 } 212 }
210 213
@@ -218,9 +221,16 @@ static long madvise_remove(struct vm_area_struct *vma,
218 endoff = (loff_t)(end - vma->vm_start - 1) 221 endoff = (loff_t)(end - vma->vm_start - 1)
219 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 222 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
220 223
221 /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ 224 /*
225 * vmtruncate_range may need to take i_mutex and i_alloc_sem.
226 * We need to explicitly grab a reference because the vma (and
227 * hence the vma's reference to the file) can go away as soon as
228 * we drop mmap_sem.
229 */
230 get_file(f);
222 up_read(&current->mm->mmap_sem); 231 up_read(&current->mm->mmap_sem);
223 error = vmtruncate_range(mapping->host, offset, endoff); 232 error = vmtruncate_range(mapping->host, offset, endoff);
233 fput(f);
224 down_read(&current->mm->mmap_sem); 234 down_read(&current->mm->mmap_sem);
225 return error; 235 return error;
226} 236}
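[Editorial note, not part of the patch] The change above pins the backing file with get_file()/fput() because madvise_remove() drops mmap_sem around vmtruncate_range(), and the vma, the only thing holding a reference to the file, can disappear in that window. For context, an illustrative userspace example of what this path does: MADV_REMOVE punches a hole in a tmpfs/shmem-backed mapping, so later reads of the removed range see zeroes.

#define _GNU_SOURCE		/* MADV_REMOVE via <sys/mman.h> */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4 * 4096;
	char *buf;

	/*
	 * Shared anonymous memory is shmem-backed, so the vma has a
	 * vm_file and madvise_remove() can punch a hole in it.
	 */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 'x', len);

	if (madvise(buf, 4096, MADV_REMOVE) != 0) {
		printf("MADV_REMOVE failed: %s\n", strerror(errno));
	} else {
		printf("removed page now reads %d (expect 0)\n", buf[0]);
		printf("untouched page still reads '%c'\n", buf[4096]);
	}

	munmap(buf, len);
	return 0;
}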
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e013b8e57d2..57cdf5ad692 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1251,7 +1251,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1251unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 1251unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1252 struct list_head *dst, 1252 struct list_head *dst,
1253 unsigned long *scanned, int order, 1253 unsigned long *scanned, int order,
1254 int mode, struct zone *z, 1254 isolate_mode_t mode,
1255 struct zone *z,
1255 struct mem_cgroup *mem_cont, 1256 struct mem_cgroup *mem_cont,
1256 int active, int file) 1257 int active, int file)
1257{ 1258{
@@ -1730,7 +1731,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1730 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1731 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1731 1732
1732 /* If memsw_is_minimum==1, swap-out is of no use. */ 1733 /* If memsw_is_minimum==1, swap-out is of no use. */
1733 if (!check_soft && root_mem->memsw_is_minimum) 1734 if (!check_soft && !shrink && root_mem->memsw_is_minimum)
1734 noswap = true; 1735 noswap = true;
1735 1736
1736 while (1) { 1737 while (1) {
@@ -3422,6 +3423,50 @@ int mem_cgroup_shmem_charge_fallback(struct page *page,
3422 return ret; 3423 return ret;
3423} 3424}
3424 3425
3426/*
3427 * At replace page cache, newpage is not under any memcg but it's on
3428 * LRU. So, this function doesn't touch res_counter but handles LRU
3429 * in correct way. Both pages are locked so we cannot race with uncharge.
3430 */
3431void mem_cgroup_replace_page_cache(struct page *oldpage,
3432 struct page *newpage)
3433{
3434 struct mem_cgroup *memcg;
3435 struct page_cgroup *pc;
3436 struct zone *zone;
3437 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3438 unsigned long flags;
3439
3440 if (mem_cgroup_disabled())
3441 return;
3442
3443 pc = lookup_page_cgroup(oldpage);
3444 /* fix accounting on old pages */
3445 lock_page_cgroup(pc);
3446 memcg = pc->mem_cgroup;
3447 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
3448 ClearPageCgroupUsed(pc);
3449 unlock_page_cgroup(pc);
3450
3451 if (PageSwapBacked(oldpage))
3452 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3453
3454 zone = page_zone(newpage);
3455 pc = lookup_page_cgroup(newpage);
3456 /*
3457 * Even if newpage->mapping was NULL before starting replacement,
3458 * the newpage may be on LRU(or pagevec for LRU) already. We lock
3459 * LRU while we overwrite pc->mem_cgroup.
3460 */
3461 spin_lock_irqsave(&zone->lru_lock, flags);
3462 if (PageLRU(newpage))
3463 del_page_from_lru_list(zone, newpage, page_lru(newpage));
3464 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type);
3465 if (PageLRU(newpage))
3466 add_page_to_lru_list(zone, newpage, page_lru(newpage));
3467 spin_unlock_irqrestore(&zone->lru_lock, flags);
3468}
3469
3425#ifdef CONFIG_DEBUG_VM 3470#ifdef CONFIG_DEBUG_VM
3426static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3471static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3427{ 3472{
@@ -4514,6 +4559,9 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4514 */ 4559 */
4515 BUG_ON(!thresholds); 4560 BUG_ON(!thresholds);
4516 4561
4562 if (!thresholds->primary)
4563 goto unlock;
4564
4517 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4565 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4518 4566
4519 /* Check if a threshold crossed before removing */ 4567 /* Check if a threshold crossed before removing */
@@ -4558,11 +4606,17 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4558swap_buffers: 4606swap_buffers:
4559 /* Swap primary and spare array */ 4607 /* Swap primary and spare array */
4560 thresholds->spare = thresholds->primary; 4608 thresholds->spare = thresholds->primary;
4609 /* If all events are unregistered, free the spare array */
4610 if (!new) {
4611 kfree(thresholds->spare);
4612 thresholds->spare = NULL;
4613 }
4614
4561 rcu_assign_pointer(thresholds->primary, new); 4615 rcu_assign_pointer(thresholds->primary, new);
4562 4616
4563 /* To be sure that nobody uses thresholds */ 4617 /* To be sure that nobody uses thresholds */
4564 synchronize_rcu(); 4618 synchronize_rcu();
4565 4619unlock:
4566 mutex_unlock(&memcg->thresholds_lock); 4620 mutex_unlock(&memcg->thresholds_lock);
4567} 4621}
4568 4622
@@ -4963,9 +5017,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4963 int cpu; 5017 int cpu;
4964 enable_swap_cgroup(); 5018 enable_swap_cgroup();
4965 parent = NULL; 5019 parent = NULL;
4966 root_mem_cgroup = mem;
4967 if (mem_cgroup_soft_limit_tree_init()) 5020 if (mem_cgroup_soft_limit_tree_init())
4968 goto free_out; 5021 goto free_out;
5022 root_mem_cgroup = mem;
4969 for_each_possible_cpu(cpu) { 5023 for_each_possible_cpu(cpu) {
4970 struct memcg_stock_pcp *stock = 5024 struct memcg_stock_pcp *stock =
4971 &per_cpu(memcg_stock, cpu); 5025 &per_cpu(memcg_stock, cpu);
@@ -5004,7 +5058,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5004 return &mem->css; 5058 return &mem->css;
5005free_out: 5059free_out:
5006 __mem_cgroup_free(mem); 5060 __mem_cgroup_free(mem);
5007 root_mem_cgroup = NULL;
5008 return ERR_PTR(error); 5061 return ERR_PTR(error);
5009} 5062}
5010 5063
@@ -5244,6 +5297,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5244 spinlock_t *ptl; 5297 spinlock_t *ptl;
5245 5298
5246 split_huge_page_pmd(walk->mm, pmd); 5299 split_huge_page_pmd(walk->mm, pmd);
5300 if (pmd_trans_unstable(pmd))
5301 return 0;
5247 5302
5248 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5303 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5249 for (; addr != end; pte++, addr += PAGE_SIZE) 5304 for (; addr != end; pte++, addr += PAGE_SIZE)
@@ -5405,6 +5460,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5405 spinlock_t *ptl; 5460 spinlock_t *ptl;
5406 5461
5407 split_huge_page_pmd(walk->mm, pmd); 5462 split_huge_page_pmd(walk->mm, pmd);
5463 if (pmd_trans_unstable(pmd))
5464 return 0;
5408retry: 5465retry:
5409 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5466 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5410 for (; addr != end; addr += PAGE_SIZE) { 5467 for (; addr != end; addr += PAGE_SIZE) {
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 740c4f52059..2f49dcf4f47 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1334,8 +1334,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
1334 /* Keep page count to indicate a given hugepage is isolated. */ 1334 /* Keep page count to indicate a given hugepage is isolated. */
1335 1335
1336 list_add(&hpage->lru, &pagelist); 1336 list_add(&hpage->lru, &pagelist);
1337 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, 1337 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false,
1338 true); 1338 MIGRATE_SYNC);
1339 if (ret) { 1339 if (ret) {
1340 struct page *page1, *page2; 1340 struct page *page1, *page2;
1341 list_for_each_entry_safe(page1, page2, &pagelist, lru) 1341 list_for_each_entry_safe(page1, page2, &pagelist, lru)
@@ -1464,7 +1464,7 @@ int soft_offline_page(struct page *page, int flags)
1464 page_is_file_cache(page)); 1464 page_is_file_cache(page));
1465 list_add(&page->lru, &pagelist); 1465 list_add(&page->lru, &pagelist);
1466 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1466 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1467 0, true); 1467 false, MIGRATE_SYNC);
1468 if (ret) { 1468 if (ret) {
1469 putback_lru_pages(&pagelist); 1469 putback_lru_pages(&pagelist);
1470 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1470 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index 36e889cca24..79ff0613449 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1228,16 +1228,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1228 do { 1228 do {
1229 next = pmd_addr_end(addr, end); 1229 next = pmd_addr_end(addr, end);
1230 if (pmd_trans_huge(*pmd)) { 1230 if (pmd_trans_huge(*pmd)) {
1231 if (next-addr != HPAGE_PMD_SIZE) { 1231 if (next - addr != HPAGE_PMD_SIZE) {
1232 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1232 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1233 split_huge_page_pmd(vma->vm_mm, pmd); 1233 split_huge_page_pmd(vma->vm_mm, pmd);
1234 } else if (zap_huge_pmd(tlb, vma, pmd)) 1234 } else if (zap_huge_pmd(tlb, vma, pmd))
1235 continue; 1235 goto next;
1236 /* fall through */ 1236 /* fall through */
1237 } 1237 }
1238 if (pmd_none_or_clear_bad(pmd)) 1238 /*
1239 continue; 1239 * Here there can be other concurrent MADV_DONTNEED or
1240 * trans huge page faults running, and if the pmd is
1241 * none or trans huge it can change under us. This is
1242 * because MADV_DONTNEED holds the mmap_sem in read
1243 * mode.
1244 */
1245 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1246 goto next;
1240 next = zap_pte_range(tlb, vma, pmd, addr, next, details); 1247 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1248next:
1241 cond_resched(); 1249 cond_resched();
1242 } while (pmd++, addr = next, addr != end); 1250 } while (pmd++, addr = next, addr != end);
1243 1251
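[Editorial note, not part of the patch] The new comment in zap_pmd_range() names MADV_DONTNEED as the reason the pmd can change under the walker: madvise(MADV_DONTNEED) zaps page tables while holding mmap_sem only for read, so it can run concurrently with another zap or a transparent-hugepage fault. An illustrative userspace view of that operation:

#define _GNU_SOURCE		/* MADV_DONTNEED via <sys/mman.h> */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2 * 4096;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 'y', len);

	/*
	 * Zaps the range's page tables under mmap_sem held for read,
	 * which is exactly the concurrency zap_pmd_range() must now
	 * tolerate.
	 */
	madvise(buf, len, MADV_DONTNEED);

	/* Private anonymous memory reads back as zero after the zap. */
	printf("after MADV_DONTNEED: %d (expect 0)\n", buf[0]);

	munmap(buf, len);
	return 0;
}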
@@ -1514,7 +1522,7 @@ split_fallthrough:
1514 } 1522 }
1515 1523
1516 if (flags & FOLL_GET) 1524 if (flags & FOLL_GET)
1517 get_page(page); 1525 get_page_foll(page);
1518 if (flags & FOLL_TOUCH) { 1526 if (flags & FOLL_TOUCH) {
1519 if ((flags & FOLL_WRITE) && 1527 if ((flags & FOLL_WRITE) &&
1520 !pte_dirty(pte) && !PageDirty(page)) 1528 !pte_dirty(pte) && !PageDirty(page))
@@ -1816,7 +1824,63 @@ next_page:
1816} 1824}
1817EXPORT_SYMBOL(__get_user_pages); 1825EXPORT_SYMBOL(__get_user_pages);
1818 1826
1819/** 1827/*
1828 * fixup_user_fault() - manually resolve a user page fault
1829 * @tsk: the task_struct to use for page fault accounting, or
1830 * NULL if faults are not to be recorded.
1831 * @mm: mm_struct of target mm
1832 * @address: user address
1833 * @fault_flags: flags to pass down to handle_mm_fault()
1834 *
1835 * This is meant to be called in the specific scenario where for locking reasons
1836 * we try to access user memory in atomic context (within a pagefault_disable()
1837 * section), this returns -EFAULT, and we want to resolve the user fault before
1838 * trying again.
1839 *
1840 * Typically this is meant to be used by the futex code.
1841 *
1842 * The main difference with get_user_pages() is that this function will
1843 * unconditionally call handle_mm_fault() which will in turn perform all the
1844 * necessary SW fixup of the dirty and young bits in the PTE, while
1845 * get_user_pages() only guarantees to update these in the struct page.
1846 *
1847 * This is important for some architectures where those bits also gate the
1848 * access permission to the page because they are maintained in software. On
1849 * such architectures, gup() will not be enough to make a subsequent access
1850 * succeed.
1851 *
1852 * This should be called with the mmap_sem held for read.
1853 */
1854int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1855 unsigned long address, unsigned int fault_flags)
1856{
1857 struct vm_area_struct *vma;
1858 int ret;
1859
1860 vma = find_extend_vma(mm, address);
1861 if (!vma || address < vma->vm_start)
1862 return -EFAULT;
1863
1864 ret = handle_mm_fault(mm, vma, address, fault_flags);
1865 if (ret & VM_FAULT_ERROR) {
1866 if (ret & VM_FAULT_OOM)
1867 return -ENOMEM;
1868 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1869 return -EHWPOISON;
1870 if (ret & VM_FAULT_SIGBUS)
1871 return -EFAULT;
1872 BUG();
1873 }
1874 if (tsk) {
1875 if (ret & VM_FAULT_MAJOR)
1876 tsk->maj_flt++;
1877 else
1878 tsk->min_flt++;
1879 }
1880 return 0;
1881}
1882
1883/*
1820 * get_user_pages() - pin user pages in memory 1884 * get_user_pages() - pin user pages in memory
1821 * @tsk: the task_struct to use for page fault accounting, or 1885 * @tsk: the task_struct to use for page fault accounting, or
1822 * NULL if faults are not to be recorded. 1886 * NULL if faults are not to be recorded.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c46887b5a11..ae5a3f21010 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -747,7 +747,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
747 } 747 }
748 /* this function returns # of failed pages */ 748 /* this function returns # of failed pages */
749 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 749 ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
750 true, true); 750 true, MIGRATE_SYNC);
751 if (ret) 751 if (ret)
752 putback_lru_pages(&source); 752 putback_lru_pages(&source);
753 } 753 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e7fb9d25c54..cff919fe702 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -511,7 +511,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
511 do { 511 do {
512 next = pmd_addr_end(addr, end); 512 next = pmd_addr_end(addr, end);
513 split_huge_page_pmd(vma->vm_mm, pmd); 513 split_huge_page_pmd(vma->vm_mm, pmd);
514 if (pmd_none_or_clear_bad(pmd)) 514 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
515 continue; 515 continue;
516 if (check_pte_range(vma, pmd, addr, next, nodes, 516 if (check_pte_range(vma, pmd, addr, next, nodes,
517 flags, private)) 517 flags, private))
@@ -606,27 +606,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
606 return first; 606 return first;
607} 607}
608 608
609/* Apply policy to a single VMA */
610static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
611{
612 int err = 0;
613 struct mempolicy *old = vma->vm_policy;
614
615 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
616 vma->vm_start, vma->vm_end, vma->vm_pgoff,
617 vma->vm_ops, vma->vm_file,
618 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
619
620 if (vma->vm_ops && vma->vm_ops->set_policy)
621 err = vma->vm_ops->set_policy(vma, new);
622 if (!err) {
623 mpol_get(new);
624 vma->vm_policy = new;
625 mpol_put(old);
626 }
627 return err;
628}
629
630/* Step 2: apply policy to a range and do splits. */ 609/* Step 2: apply policy to a range and do splits. */
631static int mbind_range(struct mm_struct *mm, unsigned long start, 610static int mbind_range(struct mm_struct *mm, unsigned long start,
632 unsigned long end, struct mempolicy *new_pol) 611 unsigned long end, struct mempolicy *new_pol)
@@ -666,9 +645,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
666 if (err) 645 if (err)
667 goto out; 646 goto out;
668 } 647 }
669 err = policy_vma(vma, new_pol); 648
670 if (err) 649 /*
671 goto out; 650 * Apply policy to a single VMA. The reference counting of
651 * policy for vma_policy linkages has already been handled by
652 * vma_merge and split_vma as necessary. If this is a shared
653 * policy then ->set_policy will increment the reference count
654 * for an sp node.
655 */
656 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
657 vma->vm_start, vma->vm_end, vma->vm_pgoff,
658 vma->vm_ops, vma->vm_file,
659 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
660 if (vma->vm_ops && vma->vm_ops->set_policy) {
661 err = vma->vm_ops->set_policy(vma, new_pol);
662 if (err)
663 goto out;
664 }
672 } 665 }
673 666
674 out: 667 out:
@@ -933,7 +926,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
933 926
934 if (!list_empty(&pagelist)) { 927 if (!list_empty(&pagelist)) {
935 err = migrate_pages(&pagelist, new_node_page, dest, 928 err = migrate_pages(&pagelist, new_node_page, dest,
936 false, true); 929 false, MIGRATE_SYNC);
937 if (err) 930 if (err)
938 putback_lru_pages(&pagelist); 931 putback_lru_pages(&pagelist);
939 } 932 }
@@ -1817,18 +1810,24 @@ struct page *
1817alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 1810alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1818 unsigned long addr, int node) 1811 unsigned long addr, int node)
1819{ 1812{
1820 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1813 struct mempolicy *pol;
1821 struct zonelist *zl; 1814 struct zonelist *zl;
1822 struct page *page; 1815 struct page *page;
1816 unsigned int cpuset_mems_cookie;
1817
1818retry_cpuset:
1819 pol = get_vma_policy(current, vma, addr);
1820 cpuset_mems_cookie = get_mems_allowed();
1823 1821
1824 get_mems_allowed();
1825 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1822 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1826 unsigned nid; 1823 unsigned nid;
1827 1824
1828 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); 1825 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1829 mpol_cond_put(pol); 1826 mpol_cond_put(pol);
1830 page = alloc_page_interleave(gfp, order, nid); 1827 page = alloc_page_interleave(gfp, order, nid);
1831 put_mems_allowed(); 1828 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1829 goto retry_cpuset;
1830
1832 return page; 1831 return page;
1833 } 1832 }
1834 zl = policy_zonelist(gfp, pol, node); 1833 zl = policy_zonelist(gfp, pol, node);
@@ -1839,7 +1838,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1839 struct page *page = __alloc_pages_nodemask(gfp, order, 1838 struct page *page = __alloc_pages_nodemask(gfp, order,
1840 zl, policy_nodemask(gfp, pol)); 1839 zl, policy_nodemask(gfp, pol));
1841 __mpol_put(pol); 1840 __mpol_put(pol);
1842 put_mems_allowed(); 1841 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1842 goto retry_cpuset;
1843 return page; 1843 return page;
1844 } 1844 }
1845 /* 1845 /*
@@ -1847,7 +1847,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1847 */ 1847 */
1848 page = __alloc_pages_nodemask(gfp, order, zl, 1848 page = __alloc_pages_nodemask(gfp, order, zl,
1849 policy_nodemask(gfp, pol)); 1849 policy_nodemask(gfp, pol));
1850 put_mems_allowed(); 1850 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1851 goto retry_cpuset;
1851 return page; 1852 return page;
1852} 1853}
1853 1854
@@ -1874,11 +1875,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1874{ 1875{
1875 struct mempolicy *pol = current->mempolicy; 1876 struct mempolicy *pol = current->mempolicy;
1876 struct page *page; 1877 struct page *page;
1878 unsigned int cpuset_mems_cookie;
1877 1879
1878 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1880 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1879 pol = &default_policy; 1881 pol = &default_policy;
1880 1882
1881 get_mems_allowed(); 1883retry_cpuset:
1884 cpuset_mems_cookie = get_mems_allowed();
1885
1882 /* 1886 /*
1883 * No reference counting needed for current->mempolicy 1887 * No reference counting needed for current->mempolicy
1884 * nor system default_policy 1888 * nor system default_policy
@@ -1889,7 +1893,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1889 page = __alloc_pages_nodemask(gfp, order, 1893 page = __alloc_pages_nodemask(gfp, order,
1890 policy_zonelist(gfp, pol, numa_node_id()), 1894 policy_zonelist(gfp, pol, numa_node_id()),
1891 policy_nodemask(gfp, pol)); 1895 policy_nodemask(gfp, pol));
1892 put_mems_allowed(); 1896
1897 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1898 goto retry_cpuset;
1899
1893 return page; 1900 return page;
1894} 1901}
1895EXPORT_SYMBOL(alloc_pages_current); 1902EXPORT_SYMBOL(alloc_pages_current);
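[Editorial note, not part of the patch] The pattern introduced above is a seqcount-style read: take a cookie from get_mems_allowed(), attempt the allocation, and retry only if the allocation failed while put_mems_allowed() reports that the cpuset's mems_allowed changed underneath. A small userspace model of that shape; read_begin(), read_retry() and try_allocate() are made-up names standing in for the cpuset helpers and the page allocator:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint seq;			/* bumped by the "updater" side */

static unsigned int read_begin(void)	/* stands in for get_mems_allowed() */
{
	return atomic_load(&seq);
}

static bool read_retry(unsigned int cookie)	/* stands in for !put_mems_allowed() */
{
	return atomic_load(&seq) != cookie;	/* true if an update raced with us */
}

static bool try_allocate(void)		/* stands in for __alloc_pages_nodemask() */
{
	static int calls;
	return ++calls > 1;		/* pretend the first attempt fails */
}

int main(void)
{
	unsigned int cookie;
	bool ok;

	do {
		cookie = read_begin();
		ok = try_allocate();
		atomic_fetch_add(&seq, 1);	/* simulate a concurrent cpuset update */
	} while (!ok && read_retry(cookie));

	printf("allocation %s after retrying on a stale mems_allowed snapshot\n",
	       ok ? "succeeded" : "gave up");
	return 0;
}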
diff --git a/mm/migrate.c b/mm/migrate.c
index 666e4e67741..480714b6f3f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -120,10 +120,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
120 120
121 ptep = pte_offset_map(pmd, addr); 121 ptep = pte_offset_map(pmd, addr);
122 122
123 if (!is_swap_pte(*ptep)) { 123 /*
124 pte_unmap(ptep); 124 * Peek to check is_swap_pte() before taking ptlock? No, we
125 goto out; 125 * can race mremap's move_ptes(), which skips anon_vma lock.
126 } 126 */
127 127
128 ptl = pte_lockptr(mm, pmd); 128 ptl = pte_lockptr(mm, pmd);
129 } 129 }
@@ -220,6 +220,56 @@ out:
220 pte_unmap_unlock(ptep, ptl); 220 pte_unmap_unlock(ptep, ptl);
221} 221}
222 222
223#ifdef CONFIG_BLOCK
224/* Returns true if all buffers are successfully locked */
225static bool buffer_migrate_lock_buffers(struct buffer_head *head,
226 enum migrate_mode mode)
227{
228 struct buffer_head *bh = head;
229
230 /* Simple case, sync compaction */
231 if (mode != MIGRATE_ASYNC) {
232 do {
233 get_bh(bh);
234 lock_buffer(bh);
235 bh = bh->b_this_page;
236
237 } while (bh != head);
238
239 return true;
240 }
241
242 /* async case, we cannot block on lock_buffer so use trylock_buffer */
243 do {
244 get_bh(bh);
245 if (!trylock_buffer(bh)) {
246 /*
247 * We failed to lock the buffer and cannot stall in
248 * async migration. Release the taken locks
249 */
250 struct buffer_head *failed_bh = bh;
251 put_bh(failed_bh);
252 bh = head;
253 while (bh != failed_bh) {
254 unlock_buffer(bh);
255 put_bh(bh);
256 bh = bh->b_this_page;
257 }
258 return false;
259 }
260
261 bh = bh->b_this_page;
262 } while (bh != head);
263 return true;
264}
265#else
266static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
267 enum migrate_mode mode)
268{
269 return true;
270}
271#endif /* CONFIG_BLOCK */
272
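[Editorial note, not part of the patch] buffer_migrate_lock_buffers() above is an instance of a general non-blocking locking pattern: in the MIGRATE_ASYNC case, try-lock every buffer and, on the first failure, unwind the locks already taken instead of sleeping. A userspace analogue with POSIX mutexes, illustrative only (build with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NLOCKS 4

static pthread_mutex_t locks[NLOCKS];

static bool trylock_all(void)
{
	int i;

	for (i = 0; i < NLOCKS; i++) {
		if (pthread_mutex_trylock(&locks[i]) != 0) {
			/*
			 * Roll back: unlock everything taken so far, like
			 * the failed_bh loop in buffer_migrate_lock_buffers().
			 */
			while (--i >= 0)
				pthread_mutex_unlock(&locks[i]);
			return false;
		}
	}
	return true;
}

static void unlock_all(void)
{
	int i;

	for (i = 0; i < NLOCKS; i++)
		pthread_mutex_unlock(&locks[i]);
}

int main(void)
{
	int i;

	for (i = 0; i < NLOCKS; i++)
		pthread_mutex_init(&locks[i], NULL);

	if (trylock_all()) {
		printf("took all %d locks without blocking\n", NLOCKS);
		unlock_all();
	} else {
		printf("backed off without blocking\n");
	}
	return 0;
}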
223/* 273/*
224 * Replace the page in the mapping. 274 * Replace the page in the mapping.
225 * 275 *
@@ -229,7 +279,8 @@ out:
229 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 279 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
230 */ 280 */
231static int migrate_page_move_mapping(struct address_space *mapping, 281static int migrate_page_move_mapping(struct address_space *mapping,
232 struct page *newpage, struct page *page) 282 struct page *newpage, struct page *page,
283 struct buffer_head *head, enum migrate_mode mode)
233{ 284{
234 int expected_count; 285 int expected_count;
235 void **pslot; 286 void **pslot;
@@ -259,6 +310,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
259 } 310 }
260 311
261 /* 312 /*
313 * In the async migration case of moving a page with buffers, lock the
314 * buffers using trylock before the mapping is moved. If the mapping
315 * were moved first and we then failed to lock the buffers, we could
316 * not move the mapping back due to an elevated page count, and would
317 * have to block waiting on other references to be dropped.
318 */
319 if (mode == MIGRATE_ASYNC && head &&
320 !buffer_migrate_lock_buffers(head, mode)) {
321 page_unfreeze_refs(page, expected_count);
322 spin_unlock_irq(&mapping->tree_lock);
323 return -EAGAIN;
324 }
325
326 /*
262 * Now we know that no one else is looking at the page. 327 * Now we know that no one else is looking at the page.
263 */ 328 */
264 get_page(newpage); /* add cache reference */ 329 get_page(newpage); /* add cache reference */
@@ -415,13 +480,14 @@ EXPORT_SYMBOL(fail_migrate_page);
415 * Pages are locked upon entry and exit. 480 * Pages are locked upon entry and exit.
416 */ 481 */
417int migrate_page(struct address_space *mapping, 482int migrate_page(struct address_space *mapping,
418 struct page *newpage, struct page *page) 483 struct page *newpage, struct page *page,
484 enum migrate_mode mode)
419{ 485{
420 int rc; 486 int rc;
421 487
422 BUG_ON(PageWriteback(page)); /* Writeback must be complete */ 488 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
423 489
424 rc = migrate_page_move_mapping(mapping, newpage, page); 490 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
425 491
426 if (rc) 492 if (rc)
427 return rc; 493 return rc;
@@ -438,28 +504,28 @@ EXPORT_SYMBOL(migrate_page);
438 * exist. 504 * exist.
439 */ 505 */
440int buffer_migrate_page(struct address_space *mapping, 506int buffer_migrate_page(struct address_space *mapping,
441 struct page *newpage, struct page *page) 507 struct page *newpage, struct page *page, enum migrate_mode mode)
442{ 508{
443 struct buffer_head *bh, *head; 509 struct buffer_head *bh, *head;
444 int rc; 510 int rc;
445 511
446 if (!page_has_buffers(page)) 512 if (!page_has_buffers(page))
447 return migrate_page(mapping, newpage, page); 513 return migrate_page(mapping, newpage, page, mode);
448 514
449 head = page_buffers(page); 515 head = page_buffers(page);
450 516
451 rc = migrate_page_move_mapping(mapping, newpage, page); 517 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
452 518
453 if (rc) 519 if (rc)
454 return rc; 520 return rc;
455 521
456 bh = head; 522 /*
457 do { 523 * In the async case, migrate_page_move_mapping locked the buffers
458 get_bh(bh); 524 * with an IRQ-safe spinlock held. In the sync case, the buffers
459 lock_buffer(bh); 525 * need to be locked now
460 bh = bh->b_this_page; 526 */
461 527 if (mode != MIGRATE_ASYNC)
462 } while (bh != head); 528 BUG_ON(!buffer_migrate_lock_buffers(head, mode));
463 529
464 ClearPagePrivate(page); 530 ClearPagePrivate(page);
465 set_page_private(newpage, page_private(page)); 531 set_page_private(newpage, page_private(page));
@@ -536,10 +602,14 @@ static int writeout(struct address_space *mapping, struct page *page)
536 * Default handling if a filesystem does not provide a migration function. 602 * Default handling if a filesystem does not provide a migration function.
537 */ 603 */
538static int fallback_migrate_page(struct address_space *mapping, 604static int fallback_migrate_page(struct address_space *mapping,
539 struct page *newpage, struct page *page) 605 struct page *newpage, struct page *page, enum migrate_mode mode)
540{ 606{
541 if (PageDirty(page)) 607 if (PageDirty(page)) {
608 /* Only writeback pages in full synchronous migration */
609 if (mode != MIGRATE_SYNC)
610 return -EBUSY;
542 return writeout(mapping, page); 611 return writeout(mapping, page);
612 }
543 613
544 /* 614 /*
545 * Buffers may be managed in a filesystem specific way. 615 * Buffers may be managed in a filesystem specific way.
@@ -549,7 +619,7 @@ static int fallback_migrate_page(struct address_space *mapping,
549 !try_to_release_page(page, GFP_KERNEL)) 619 !try_to_release_page(page, GFP_KERNEL))
550 return -EAGAIN; 620 return -EAGAIN;
551 621
552 return migrate_page(mapping, newpage, page); 622 return migrate_page(mapping, newpage, page, mode);
553} 623}
554 624
555/* 625/*
@@ -564,7 +634,7 @@ static int fallback_migrate_page(struct address_space *mapping,
564 * == 0 - success 634 * == 0 - success
565 */ 635 */
566static int move_to_new_page(struct page *newpage, struct page *page, 636static int move_to_new_page(struct page *newpage, struct page *page,
567 int remap_swapcache, bool sync) 637 int remap_swapcache, enum migrate_mode mode)
568{ 638{
569 struct address_space *mapping; 639 struct address_space *mapping;
570 int rc; 640 int rc;
@@ -585,29 +655,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
585 655
586 mapping = page_mapping(page); 656 mapping = page_mapping(page);
587 if (!mapping) 657 if (!mapping)
588 rc = migrate_page(mapping, newpage, page); 658 rc = migrate_page(mapping, newpage, page, mode);
589 else { 659 else if (mapping->a_ops->migratepage)
590 /* 660 /*
591 * Do not writeback pages if !sync and migratepage is 661 * Most pages have a mapping and most filesystems provide a
592 * not pointing to migrate_page() which is nonblocking 662 * migratepage callback. Anonymous pages are part of swap
593 * (swapcache/tmpfs uses migratepage = migrate_page). 663 * space which also has its own migratepage callback. This
664 * is the most common path for page migration.
594 */ 665 */
595 if (PageDirty(page) && !sync && 666 rc = mapping->a_ops->migratepage(mapping,
596 mapping->a_ops->migratepage != migrate_page) 667 newpage, page, mode);
597 rc = -EBUSY; 668 else
598 else if (mapping->a_ops->migratepage) 669 rc = fallback_migrate_page(mapping, newpage, page, mode);
599 /*
600 * Most pages have a mapping and most filesystems
601 * should provide a migration function. Anonymous
602 * pages are part of swap space which also has its
603 * own migration function. This is the most common
604 * path for page migration.
605 */
606 rc = mapping->a_ops->migratepage(mapping,
607 newpage, page);
608 else
609 rc = fallback_migrate_page(mapping, newpage, page);
610 }
611 670
612 if (rc) { 671 if (rc) {
613 newpage->mapping = NULL; 672 newpage->mapping = NULL;
@@ -621,38 +680,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
621 return rc; 680 return rc;
622} 681}
623 682
624/* 683static int __unmap_and_move(struct page *page, struct page *newpage,
625 * Obtain the lock on page, remove all ptes and migrate the page 684 int force, bool offlining, enum migrate_mode mode)
626 * to the newly allocated page in newpage.
627 */
628static int unmap_and_move(new_page_t get_new_page, unsigned long private,
629 struct page *page, int force, bool offlining, bool sync)
630{ 685{
631 int rc = 0; 686 int rc = -EAGAIN;
632 int *result = NULL;
633 struct page *newpage = get_new_page(page, private, &result);
634 int remap_swapcache = 1; 687 int remap_swapcache = 1;
635 int charge = 0; 688 int charge = 0;
636 struct mem_cgroup *mem; 689 struct mem_cgroup *mem;
637 struct anon_vma *anon_vma = NULL; 690 struct anon_vma *anon_vma = NULL;
638 691
639 if (!newpage)
640 return -ENOMEM;
641
642 if (page_count(page) == 1) {
643 /* page was freed from under us. So we are done. */
644 goto move_newpage;
645 }
646 if (unlikely(PageTransHuge(page)))
647 if (unlikely(split_huge_page(page)))
648 goto move_newpage;
649
650 /* prepare cgroup just returns 0 or -ENOMEM */
651 rc = -EAGAIN;
652
653 if (!trylock_page(page)) { 692 if (!trylock_page(page)) {
654 if (!force || !sync) 693 if (!force || mode == MIGRATE_ASYNC)
655 goto move_newpage; 694 goto out;
656 695
657 /* 696 /*
658 * It's not safe for direct compaction to call lock_page. 697 * It's not safe for direct compaction to call lock_page.
@@ -668,7 +707,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
668 * altogether. 707 * altogether.
669 */ 708 */
670 if (current->flags & PF_MEMALLOC) 709 if (current->flags & PF_MEMALLOC)
671 goto move_newpage; 710 goto out;
672 711
673 lock_page(page); 712 lock_page(page);
674 } 713 }
@@ -697,10 +736,12 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
697 736
698 if (PageWriteback(page)) { 737 if (PageWriteback(page)) {
699 /* 738 /*
700 * For !sync, there is no point retrying as the retry loop 739 * Only in the case of a full syncronous migration is it
701 * is expected to be too short for PageWriteback to be cleared 740 * necessary to wait for PageWriteback. In the async case,
741 * the retry loop is too short and in the sync-light case,
742 * the overhead of stalling is too much
702 */ 743 */
703 if (!sync) { 744 if (mode != MIGRATE_SYNC) {
704 rc = -EBUSY; 745 rc = -EBUSY;
705 goto uncharge; 746 goto uncharge;
706 } 747 }
@@ -771,7 +812,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
771 812
772skip_unmap: 813skip_unmap:
773 if (!page_mapped(page)) 814 if (!page_mapped(page))
774 rc = move_to_new_page(newpage, page, remap_swapcache, sync); 815 rc = move_to_new_page(newpage, page, remap_swapcache, mode);
775 816
776 if (rc && remap_swapcache) 817 if (rc && remap_swapcache)
777 remove_migration_ptes(page, page); 818 remove_migration_ptes(page, page);
@@ -785,27 +826,53 @@ uncharge:
785 mem_cgroup_end_migration(mem, page, newpage, rc == 0); 826 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
786unlock: 827unlock:
787 unlock_page(page); 828 unlock_page(page);
829out:
830 return rc;
831}
788 832
789move_newpage: 833/*
834 * Obtain the lock on page, remove all ptes and migrate the page
835 * to the newly allocated page in newpage.
836 */
837static int unmap_and_move(new_page_t get_new_page, unsigned long private,
838 struct page *page, int force, bool offlining,
839 enum migrate_mode mode)
840{
841 int rc = 0;
842 int *result = NULL;
843 struct page *newpage = get_new_page(page, private, &result);
844
845 if (!newpage)
846 return -ENOMEM;
847
848 if (page_count(page) == 1) {
849 /* page was freed from under us. So we are done. */
850 goto out;
851 }
852
853 if (unlikely(PageTransHuge(page)))
854 if (unlikely(split_huge_page(page)))
855 goto out;
856
857 rc = __unmap_and_move(page, newpage, force, offlining, mode);
858out:
790 if (rc != -EAGAIN) { 859 if (rc != -EAGAIN) {
791 /* 860 /*
792 * A page that has been migrated has all references 861 * A page that has been migrated has all references
793 * removed and will be freed. A page that has not been 862 * removed and will be freed. A page that has not been
794 * migrated will have kept its references and be 863 * migrated will have kept its references and be
795 * restored. 864 * restored.
796 */ 865 */
797 list_del(&page->lru); 866 list_del(&page->lru);
798 dec_zone_page_state(page, NR_ISOLATED_ANON + 867 dec_zone_page_state(page, NR_ISOLATED_ANON +
799 page_is_file_cache(page)); 868 page_is_file_cache(page));
800 putback_lru_page(page); 869 putback_lru_page(page);
801 } 870 }
802
803 /* 871 /*
804 * Move the new page to the LRU. If migration was not successful 872 * Move the new page to the LRU. If migration was not successful
805 * then this will free the page. 873 * then this will free the page.
806 */ 874 */
807 putback_lru_page(newpage); 875 putback_lru_page(newpage);
808
809 if (result) { 876 if (result) {
810 if (rc) 877 if (rc)
811 *result = rc; 878 *result = rc;
@@ -835,7 +902,8 @@ move_newpage:
835 */ 902 */
836static int unmap_and_move_huge_page(new_page_t get_new_page, 903static int unmap_and_move_huge_page(new_page_t get_new_page,
837 unsigned long private, struct page *hpage, 904 unsigned long private, struct page *hpage,
838 int force, bool offlining, bool sync) 905 int force, bool offlining,
906 enum migrate_mode mode)
839{ 907{
840 int rc = 0; 908 int rc = 0;
841 int *result = NULL; 909 int *result = NULL;
@@ -848,7 +916,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
848 rc = -EAGAIN; 916 rc = -EAGAIN;
849 917
850 if (!trylock_page(hpage)) { 918 if (!trylock_page(hpage)) {
851 if (!force || !sync) 919 if (!force || mode != MIGRATE_SYNC)
852 goto out; 920 goto out;
853 lock_page(hpage); 921 lock_page(hpage);
854 } 922 }
@@ -859,7 +927,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
859 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 927 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
860 928
861 if (!page_mapped(hpage)) 929 if (!page_mapped(hpage))
862 rc = move_to_new_page(new_hpage, hpage, 1, sync); 930 rc = move_to_new_page(new_hpage, hpage, 1, mode);
863 931
864 if (rc) 932 if (rc)
865 remove_migration_ptes(hpage, hpage); 933 remove_migration_ptes(hpage, hpage);
@@ -902,7 +970,7 @@ out:
902 */ 970 */
903int migrate_pages(struct list_head *from, 971int migrate_pages(struct list_head *from,
904 new_page_t get_new_page, unsigned long private, bool offlining, 972 new_page_t get_new_page, unsigned long private, bool offlining,
905 bool sync) 973 enum migrate_mode mode)
906{ 974{
907 int retry = 1; 975 int retry = 1;
908 int nr_failed = 0; 976 int nr_failed = 0;
@@ -923,7 +991,7 @@ int migrate_pages(struct list_head *from,
923 991
924 rc = unmap_and_move(get_new_page, private, 992 rc = unmap_and_move(get_new_page, private,
925 page, pass > 2, offlining, 993 page, pass > 2, offlining,
926 sync); 994 mode);
927 995
928 switch(rc) { 996 switch(rc) {
929 case -ENOMEM: 997 case -ENOMEM:
@@ -953,7 +1021,7 @@ out:
953 1021
954int migrate_huge_pages(struct list_head *from, 1022int migrate_huge_pages(struct list_head *from,
955 new_page_t get_new_page, unsigned long private, bool offlining, 1023 new_page_t get_new_page, unsigned long private, bool offlining,
956 bool sync) 1024 enum migrate_mode mode)
957{ 1025{
958 int retry = 1; 1026 int retry = 1;
959 int nr_failed = 0; 1027 int nr_failed = 0;
@@ -970,7 +1038,7 @@ int migrate_huge_pages(struct list_head *from,
970 1038
971 rc = unmap_and_move_huge_page(get_new_page, 1039 rc = unmap_and_move_huge_page(get_new_page,
972 private, page, pass > 2, offlining, 1040 private, page, pass > 2, offlining,
973 sync); 1041 mode);
974 1042
975 switch(rc) { 1043 switch(rc) {
976 case -ENOMEM: 1044 case -ENOMEM:
@@ -1099,7 +1167,7 @@ set_status:
1099 err = 0; 1167 err = 0;
1100 if (!list_empty(&pagelist)) { 1168 if (!list_empty(&pagelist)) {
1101 err = migrate_pages(&pagelist, new_page_node, 1169 err = migrate_pages(&pagelist, new_page_node,
1102 (unsigned long)pm, 0, true); 1170 (unsigned long)pm, 0, MIGRATE_SYNC);
1103 if (err) 1171 if (err)
1104 putback_lru_pages(&pagelist); 1172 putback_lru_pages(&pagelist);
1105 } 1173 }
diff --git a/mm/mincore.c b/mm/mincore.c
index a4e6b9d75c7..117ff549279 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -161,7 +161,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
161 } 161 }
162 /* fall through */ 162 /* fall through */
163 } 163 }
164 if (pmd_none_or_clear_bad(pmd)) 164 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
165 mincore_unmapped_range(vma, addr, next, vec); 165 mincore_unmapped_range(vma, addr, next, vec);
166 else 166 else
167 mincore_pte_range(vma, pmd, addr, next, vec); 167 mincore_pte_range(vma, pmd, addr, next, vec);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8d032de4088..71c78115c45 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@
33void __mmu_notifier_release(struct mm_struct *mm) 33void __mmu_notifier_release(struct mm_struct *mm)
34{ 34{
35 struct mmu_notifier *mn; 35 struct mmu_notifier *mn;
36 struct hlist_node *n;
37
38 /*
39 * RCU here will block mmu_notifier_unregister until
40 * ->release returns.
41 */
42 rcu_read_lock();
43 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
44 /*
45 * if ->release runs before mmu_notifier_unregister it
46 * must be handled as it's the only way for the driver
47 * to flush all existing sptes and stop the driver
48 * from establishing any more sptes before all the
49 * pages in the mm are freed.
50 */
51 if (mn->ops->release)
52 mn->ops->release(mn, mm);
53 rcu_read_unlock();
36 54
37 spin_lock(&mm->mmu_notifier_mm->lock); 55 spin_lock(&mm->mmu_notifier_mm->lock);
38 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 56 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
46 * mmu_notifier_unregister to return. 64 * mmu_notifier_unregister to return.
47 */ 65 */
48 hlist_del_init_rcu(&mn->hlist); 66 hlist_del_init_rcu(&mn->hlist);
49 /*
50 * RCU here will block mmu_notifier_unregister until
51 * ->release returns.
52 */
53 rcu_read_lock();
54 spin_unlock(&mm->mmu_notifier_mm->lock);
55 /*
56 * if ->release runs before mmu_notifier_unregister it
57 * must be handled as it's the only way for the driver
58 * to flush all existing sptes and stop the driver
59 * from establishing any more sptes before all the
60 * pages in the mm are freed.
61 */
62 if (mn->ops->release)
63 mn->ops->release(mn, mm);
64 rcu_read_unlock();
65 spin_lock(&mm->mmu_notifier_mm->lock);
66 } 67 }
67 spin_unlock(&mm->mmu_notifier_mm->lock); 68 spin_unlock(&mm->mmu_notifier_mm->lock);
68 69
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
284{ 285{
285 BUG_ON(atomic_read(&mm->mm_count) <= 0); 286 BUG_ON(atomic_read(&mm->mm_count) <= 0);
286 287
287 spin_lock(&mm->mmu_notifier_mm->lock);
288 if (!hlist_unhashed(&mn->hlist)) { 288 if (!hlist_unhashed(&mn->hlist)) {
289 hlist_del_rcu(&mn->hlist);
290
291 /* 289 /*
292 * RCU here will force exit_mmap to wait ->release to finish 290 * RCU here will force exit_mmap to wait ->release to finish
293 * before freeing the pages. 291 * before freeing the pages.
294 */ 292 */
295 rcu_read_lock(); 293 rcu_read_lock();
296 spin_unlock(&mm->mmu_notifier_mm->lock); 294
297 /* 295 /*
298 * exit_mmap will block in mmu_notifier_release to 296 * exit_mmap will block in mmu_notifier_release to
299 * guarantee ->release is called before freeing the 297 * guarantee ->release is called before freeing the
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
302 if (mn->ops->release) 300 if (mn->ops->release)
303 mn->ops->release(mn, mm); 301 mn->ops->release(mn, mm);
304 rcu_read_unlock(); 302 rcu_read_unlock();
305 } else 303
304 spin_lock(&mm->mmu_notifier_mm->lock);
305 hlist_del_rcu(&mn->hlist);
306 spin_unlock(&mm->mmu_notifier_mm->lock); 306 spin_unlock(&mm->mmu_notifier_mm->lock);
307 }
307 308
308 /* 309 /*
309 * Wait any running method to finish, of course including 310 * Wait any running method to finish, of course including
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 6e93dc7f258..e39e3efe4a4 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -83,8 +83,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
83 83
84static void __init __free_pages_memory(unsigned long start, unsigned long end) 84static void __init __free_pages_memory(unsigned long start, unsigned long end)
85{ 85{
86 int i; 86 unsigned long i, start_aligned, end_aligned;
87 unsigned long start_aligned, end_aligned;
88 int order = ilog2(BITS_PER_LONG); 87 int order = ilog2(BITS_PER_LONG);
89 88
90 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); 89 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
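[Editorial note, not part of the patch] The __free_pages_memory() context above uses the usual power-of-two rounding idiom: (x + (N - 1)) & ~(N - 1) rounds x up to a multiple of N, and x & ~(N - 1) rounds it down; the patch itself only widens the loop counter to unsigned long so large PFNs do not overflow an int. A standalone check of the arithmetic, for illustration:

#include <stdio.h>

static unsigned long align_up(unsigned long x, unsigned long n)
{
	/* n must be a power of two */
	return (x + (n - 1)) & ~(n - 1);
}

static unsigned long align_down(unsigned long x, unsigned long n)
{
	return x & ~(n - 1);
}

int main(void)
{
	unsigned long bits = 64;	/* stands in for BITS_PER_LONG */

	printf("align_up(100, %lu)   = %lu\n", bits, align_up(100, bits));	/* 128 */
	printf("align_down(100, %lu) = %lu\n", bits, align_down(100, bits));	/* 64 */
	return 0;
}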
diff --git a/mm/nommu.c b/mm/nommu.c
index 9edc897a397..5ff9b35883e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -697,9 +697,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
697 if (vma->vm_file) { 697 if (vma->vm_file) {
698 mapping = vma->vm_file->f_mapping; 698 mapping = vma->vm_file->f_mapping;
699 699
700 mutex_lock(&mapping->i_mmap_mutex);
700 flush_dcache_mmap_lock(mapping); 701 flush_dcache_mmap_lock(mapping);
701 vma_prio_tree_insert(vma, &mapping->i_mmap); 702 vma_prio_tree_insert(vma, &mapping->i_mmap);
702 flush_dcache_mmap_unlock(mapping); 703 flush_dcache_mmap_unlock(mapping);
704 mutex_unlock(&mapping->i_mmap_mutex);
703 } 705 }
704 706
705 /* add the VMA to the tree */ 707 /* add the VMA to the tree */
@@ -761,9 +763,11 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
761 if (vma->vm_file) { 763 if (vma->vm_file) {
762 mapping = vma->vm_file->f_mapping; 764 mapping = vma->vm_file->f_mapping;
763 765
766 mutex_lock(&mapping->i_mmap_mutex);
764 flush_dcache_mmap_lock(mapping); 767 flush_dcache_mmap_lock(mapping);
765 vma_prio_tree_remove(vma, &mapping->i_mmap); 768 vma_prio_tree_remove(vma, &mapping->i_mmap);
766 flush_dcache_mmap_unlock(mapping); 769 flush_dcache_mmap_unlock(mapping);
770 mutex_unlock(&mapping->i_mmap_mutex);
767 } 771 }
768 772
769 /* remove from the MM's tree and list */ 773 /* remove from the MM's tree and list */
@@ -776,8 +780,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
776 780
777 if (vma->vm_next) 781 if (vma->vm_next)
778 vma->vm_next->vm_prev = vma->vm_prev; 782 vma->vm_next->vm_prev = vma->vm_prev;
779
780 vma->vm_mm = NULL;
781} 783}
782 784
783/* 785/*
@@ -2061,6 +2063,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2061 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 2063 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
2062 2064
2063 down_write(&nommu_region_sem); 2065 down_write(&nommu_region_sem);
2066 mutex_lock(&inode->i_mapping->i_mmap_mutex);
2064 2067
2065 /* search for VMAs that fall within the dead zone */ 2068 /* search for VMAs that fall within the dead zone */
2066 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, 2069 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
@@ -2068,6 +2071,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2068 /* found one - only interested if it's shared out of the page 2071 /* found one - only interested if it's shared out of the page
2069 * cache */ 2072 * cache */
2070 if (vma->vm_flags & VM_SHARED) { 2073 if (vma->vm_flags & VM_SHARED) {
2074 mutex_unlock(&inode->i_mapping->i_mmap_mutex);
2071 up_write(&nommu_region_sem); 2075 up_write(&nommu_region_sem);
2072 return -ETXTBSY; /* not quite true, but near enough */ 2076 return -ETXTBSY; /* not quite true, but near enough */
2073 } 2077 }
@@ -2095,6 +2099,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2095 } 2099 }
2096 } 2100 }
2097 2101
2102 mutex_unlock(&inode->i_mapping->i_mmap_mutex);
2098 up_write(&nommu_region_sem); 2103 up_write(&nommu_region_sem);
2099 return 0; 2104 return 0;
2100} 2105}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e4b0991ca35..7c72487ca45 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -162,7 +162,7 @@ static bool oom_unkillable_task(struct task_struct *p,
162unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, 162unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
163 const nodemask_t *nodemask, unsigned long totalpages) 163 const nodemask_t *nodemask, unsigned long totalpages)
164{ 164{
165 int points; 165 long points;
166 166
167 if (oom_unkillable_task(p, mem, nodemask)) 167 if (oom_unkillable_task(p, mem, nodemask))
168 return 0; 168 return 0;
@@ -303,7 +303,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
303 do_each_thread(g, p) { 303 do_each_thread(g, p) {
304 unsigned int points; 304 unsigned int points;
305 305
306 if (!p->mm) 306 if (p->exit_state)
307 continue; 307 continue;
308 if (oom_unkillable_task(p, mem, nodemask)) 308 if (oom_unkillable_task(p, mem, nodemask))
309 continue; 309 continue;
@@ -319,6 +319,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
319 */ 319 */
320 if (test_tsk_thread_flag(p, TIF_MEMDIE)) 320 if (test_tsk_thread_flag(p, TIF_MEMDIE))
321 return ERR_PTR(-1UL); 321 return ERR_PTR(-1UL);
322 if (!p->mm)
323 continue;
322 324
323 if (p->flags & PF_EXITING) { 325 if (p->flags & PF_EXITING) {
324 /* 326 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 31f69886242..903e46bff32 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -37,24 +37,22 @@
37#include <trace/events/writeback.h> 37#include <trace/events/writeback.h>
38 38
39/* 39/*
40 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 40 * Sleep at most 200ms at a time in balance_dirty_pages().
41 * will look to see if it needs to force writeback or throttling.
42 */ 41 */
43static long ratelimit_pages = 32; 42#define MAX_PAUSE max(HZ/5, 1)
44 43
45/* 44/*
46 * When balance_dirty_pages decides that the caller needs to perform some 45 * Estimate write bandwidth at 200ms intervals.
47 * non-background writeback, this is how many pages it will attempt to write.
48 * It should be somewhat larger than dirtied pages to ensure that reasonably
49 * large amounts of I/O are submitted.
50 */ 46 */
51static inline long sync_writeback_pages(unsigned long dirtied) 47#define BANDWIDTH_INTERVAL max(HZ/5, 1)
52{
53 if (dirtied < ratelimit_pages)
54 dirtied = ratelimit_pages;
55 48
56 return dirtied + dirtied / 2; 49#define RATELIMIT_CALC_SHIFT 10
57} 50
51/*
52 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
53 * will look to see if it needs to force writeback or throttling.
54 */
55static long ratelimit_pages = 32;
58 56
59/* The following parameters are exported via /proc/sys/vm */ 57/* The following parameters are exported via /proc/sys/vm */
60 58
@@ -111,6 +109,7 @@ EXPORT_SYMBOL(laptop_mode);
111 109
112/* End of sysctl-exported parameters */ 110/* End of sysctl-exported parameters */
113 111
112unsigned long global_dirty_limit;
114 113
115/* 114/*
116 * Scale the writeback cache size proportional to the relative writeout speeds. 115 * Scale the writeback cache size proportional to the relative writeout speeds.
@@ -156,6 +155,8 @@ static void update_completion_period(void)
156 int shift = calc_period_shift(); 155 int shift = calc_period_shift();
157 prop_change_shift(&vm_completions, shift); 156 prop_change_shift(&vm_completions, shift);
158 prop_change_shift(&vm_dirties, shift); 157 prop_change_shift(&vm_dirties, shift);
158
159 writeback_set_ratelimit();
159} 160}
160 161
161int dirty_background_ratio_handler(struct ctl_table *table, int write, 162int dirty_background_ratio_handler(struct ctl_table *table, int write,
@@ -219,6 +220,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
219 */ 220 */
220static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) 221static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
221{ 222{
223 __inc_bdi_stat(bdi, BDI_WRITTEN);
222 __prop_inc_percpu_max(&vm_completions, &bdi->completions, 224 __prop_inc_percpu_max(&vm_completions, &bdi->completions,
223 bdi->max_prop_frac); 225 bdi->max_prop_frac);
224} 226}
@@ -244,50 +246,8 @@ void task_dirty_inc(struct task_struct *tsk)
244static void bdi_writeout_fraction(struct backing_dev_info *bdi, 246static void bdi_writeout_fraction(struct backing_dev_info *bdi,
245 long *numerator, long *denominator) 247 long *numerator, long *denominator)
246{ 248{
247 if (bdi_cap_writeback_dirty(bdi)) { 249 prop_fraction_percpu(&vm_completions, &bdi->completions,
248 prop_fraction_percpu(&vm_completions, &bdi->completions,
249 numerator, denominator); 250 numerator, denominator);
250 } else {
251 *numerator = 0;
252 *denominator = 1;
253 }
254}
255
256static inline void task_dirties_fraction(struct task_struct *tsk,
257 long *numerator, long *denominator)
258{
259 prop_fraction_single(&vm_dirties, &tsk->dirties,
260 numerator, denominator);
261}
262
263/*
264 * task_dirty_limit - scale down dirty throttling threshold for one task
265 *
266 * task specific dirty limit:
267 *
268 * dirty -= (dirty/8) * p_{t}
269 *
270 * To protect light/slow dirtying tasks from heavier/fast ones, we start
271 * throttling individual tasks before reaching the bdi dirty limit.
272 * Relatively low thresholds will be allocated to heavy dirtiers. So when
273 * dirty pages grow large, heavy dirtiers will be throttled first, which will
274 * effectively curb the growth of dirty pages. Light dirtiers with high enough
275 * dirty threshold may never get throttled.
276 */
277static unsigned long task_dirty_limit(struct task_struct *tsk,
278 unsigned long bdi_dirty)
279{
280 long numerator, denominator;
281 unsigned long dirty = bdi_dirty;
282 u64 inv = dirty >> 3;
283
284 task_dirties_fraction(tsk, &numerator, &denominator);
285 inv *= numerator;
286 do_div(inv, denominator);
287
288 dirty -= inv;
289
290 return max(dirty, bdi_dirty/2);
291} 251}
292 252
293/* 253/*
@@ -397,6 +357,17 @@ unsigned long determine_dirtyable_memory(void)
397 return x + 1; /* Ensure that we never return 0 */ 357 return x + 1; /* Ensure that we never return 0 */
398} 358}
399 359
360static unsigned long dirty_freerun_ceiling(unsigned long thresh,
361 unsigned long bg_thresh)
362{
363 return (thresh + bg_thresh) / 2;
364}
365
366static unsigned long hard_dirty_limit(unsigned long thresh)
367{
368 return max(thresh, global_dirty_limit);
369}
370
400/* 371/*
401 * global_dirty_limits - background-writeback and dirty-throttling thresholds 372 * global_dirty_limits - background-writeback and dirty-throttling thresholds
402 * 373 *
@@ -435,12 +406,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
435 } 406 }
436 *pbackground = background; 407 *pbackground = background;
437 *pdirty = dirty; 408 *pdirty = dirty;
409 trace_global_dirty_state(background, dirty);
438} 410}
439 411
440/* 412/**
441 * bdi_dirty_limit - @bdi's share of dirty throttling threshold 413 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
414 * @bdi: the backing_dev_info to query
415 * @dirty: global dirty limit in pages
416 *
417 * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
418 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
419 * And the "limit" in the name is not seriously taken as hard limit in
420 * balance_dirty_pages().
442 * 421 *
443 * Allocate high/low dirty limits to fast/slow devices, in order to prevent 422 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
444 * - starving fast devices 423 * - starving fast devices
445 * - piling up dirty pages (that will take long time to sync) on slow devices 424 * - piling up dirty pages (that will take long time to sync) on slow devices
446 * 425 *
@@ -469,36 +448,588 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
469} 448}
470 449
471/* 450/*
451 * Dirty position control.
452 *
453 * (o) global/bdi setpoints
454 *
455 * We want the dirty pages be balanced around the global/bdi setpoints.
456 * When the number of dirty pages is higher/lower than the setpoint, the
457 * dirty position control ratio (and hence task dirty ratelimit) will be
458 * decreased/increased to bring the dirty pages back to the setpoint.
459 *
460 * pos_ratio = 1 << RATELIMIT_CALC_SHIFT
461 *
462 * if (dirty < setpoint) scale up pos_ratio
463 * if (dirty > setpoint) scale down pos_ratio
464 *
465 * if (bdi_dirty < bdi_setpoint) scale up pos_ratio
466 * if (bdi_dirty > bdi_setpoint) scale down pos_ratio
467 *
468 * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
469 *
470 * (o) global control line
471 *
472 * ^ pos_ratio
473 * |
474 * | |<===== global dirty control scope ======>|
475 * 2.0 .............*
476 * | .*
477 * | . *
478 * | . *
479 * | . *
480 * | . *
481 * | . *
482 * 1.0 ................................*
483 * | . . *
484 * | . . *
485 * | . . *
486 * | . . *
487 * | . . *
488 * 0 +------------.------------------.----------------------*------------->
489 * freerun^ setpoint^ limit^ dirty pages
490 *
491 * (o) bdi control line
492 *
493 * ^ pos_ratio
494 * |
495 * | *
496 * | *
497 * | *
498 * | *
499 * | * |<=========== span ============>|
500 * 1.0 .......................*
501 * | . *
502 * | . *
503 * | . *
504 * | . *
505 * | . *
506 * | . *
507 * | . *
508 * | . *
509 * | . *
510 * | . *
511 * | . *
512 * 1/4 ...............................................* * * * * * * * * * * *
513 * | . .
514 * | . .
515 * | . .
516 * 0 +----------------------.-------------------------------.------------->
517 * bdi_setpoint^ x_intercept^
518 *
519 * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can
520 * be smoothly throttled down to normal if it starts high in situations like
521 * - start writing to a slow SD card and a fast disk at the same time. The SD
522 * card's bdi_dirty may rush to many times higher than bdi_setpoint.
 523 * - the bdi dirty thresh drops quickly due to a change of JBOD workload
524 */
525static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
526 unsigned long thresh,
527 unsigned long bg_thresh,
528 unsigned long dirty,
529 unsigned long bdi_thresh,
530 unsigned long bdi_dirty)
531{
532 unsigned long write_bw = bdi->avg_write_bandwidth;
533 unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
534 unsigned long limit = hard_dirty_limit(thresh);
535 unsigned long x_intercept;
536 unsigned long setpoint; /* dirty pages' target balance point */
537 unsigned long bdi_setpoint;
538 unsigned long span;
539 long long pos_ratio; /* for scaling up/down the rate limit */
540 long x;
541
542 if (unlikely(dirty >= limit))
543 return 0;
544
545 /*
546 * global setpoint
547 *
548 * setpoint - dirty 3
549 * f(dirty) := 1.0 + (----------------)
550 * limit - setpoint
551 *
552 * it's a 3rd order polynomial that subjects to
553 *
 554 * (1) f(freerun) = 2.0 => ramp up dirty_ratelimit reasonably fast
555 * (2) f(setpoint) = 1.0 => the balance point
556 * (3) f(limit) = 0 => the hard limit
557 * (4) df/dx <= 0 => negative feedback control
558 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
559 * => fast response on large errors; small oscillation near setpoint
560 */
561 setpoint = (freerun + limit) / 2;
562 x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
563 limit - setpoint + 1);
564 pos_ratio = x;
565 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
566 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
567 pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
568
569 /*
570 * We have computed basic pos_ratio above based on global situation. If
571 * the bdi is over/under its share of dirty pages, we want to scale
572 * pos_ratio further down/up. That is done by the following mechanism.
573 */
574
575 /*
576 * bdi setpoint
577 *
578 * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
579 *
580 * x_intercept - bdi_dirty
581 * := --------------------------
582 * x_intercept - bdi_setpoint
583 *
584 * The main bdi control line is a linear function that subjects to
585 *
586 * (1) f(bdi_setpoint) = 1.0
587 * (2) k = - 1 / (8 * write_bw) (in single bdi case)
588 * or equally: x_intercept = bdi_setpoint + 8 * write_bw
589 *
590 * For single bdi case, the dirty pages are observed to fluctuate
591 * regularly within range
592 * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
 593 * for various filesystems, where (2) can yield a reasonable 12.5%
594 * fluctuation range for pos_ratio.
595 *
596 * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
597 * own size, so move the slope over accordingly and choose a slope that
598 * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
599 */
600 if (unlikely(bdi_thresh > thresh))
601 bdi_thresh = thresh;
602 bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
603 /*
604 * scale global setpoint to bdi's:
605 * bdi_setpoint = setpoint * bdi_thresh / thresh
606 */
607 x = div_u64((u64)bdi_thresh << 16, thresh + 1);
608 bdi_setpoint = setpoint * (u64)x >> 16;
609 /*
610 * Use span=(8*write_bw) in single bdi case as indicated by
 611 * (thresh - bdi_thresh ~= 0) and transition to bdi_thresh in the JBOD case.
612 *
613 * bdi_thresh thresh - bdi_thresh
614 * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
615 * thresh thresh
616 */
617 span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
618 x_intercept = bdi_setpoint + span;
619
620 if (bdi_dirty < x_intercept - span / 4) {
621 pos_ratio *= x_intercept - bdi_dirty;
622 do_div(pos_ratio, x_intercept - bdi_setpoint + 1);
623 } else
624 pos_ratio /= 4;
625
626 /*
 627 * bdi reserve area, safeguard against dirty pool underrun and disk idle.
628 * It may push the desired control point of global dirty pages higher
629 * than setpoint.
630 */
631 x_intercept = bdi_thresh / 2;
632 if (bdi_dirty < x_intercept) {
633 if (bdi_dirty > x_intercept / 8) {
634 pos_ratio *= x_intercept;
635 do_div(pos_ratio, bdi_dirty);
636 } else
637 pos_ratio *= 8;
638 }
639
640 return pos_ratio;
641}
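
The cubic control line above is easiest to sanity-check with concrete numbers. Below is a throwaway userspace sketch, not kernel code: RATELIMIT_CALC_SHIFT is assumed to be 10 (the kernel's fixed-point shift), the freerun/limit values are invented, and only the global part of bdi_position_ratio() is evaluated, not the bdi control line.

#include <stdio.h>

#define RATELIMIT_CALC_SHIFT    10      /* assumed, as in the kernel headers */

static long long global_pos_ratio(unsigned long freerun, unsigned long limit,
                                  unsigned long dirty)
{
        unsigned long setpoint = (freerun + limit) / 2;
        long long unit = 1LL << RATELIMIT_CALC_SHIFT;
        long long x, pos_ratio;

        if (dirty >= limit)
                return 0;
        /* x = (setpoint - dirty) / (limit - setpoint), in fixed point */
        x = ((long long)setpoint - (long long)dirty) * unit /
            (long long)(limit - setpoint + 1);
        pos_ratio = x;
        pos_ratio = pos_ratio * x / unit;       /* x^2 */
        pos_ratio = pos_ratio * x / unit;       /* x^3 */
        return pos_ratio + unit;                /* 1.0 + x^3 */
}

int main(void)
{
        unsigned long freerun = 1000, limit = 3000;     /* pages, invented */
        unsigned long dirty[] = { 1000, 2000, 2500, 2999 };
        int i;

        for (i = 0; i < 4; i++)
                printf("dirty=%lu  pos_ratio=%.3f\n", dirty[i],
                       (double)global_pos_ratio(freerun, limit, dirty[i]) /
                       (1 << RATELIMIT_CALC_SHIFT));
        return 0;
}

Compiled and run, it prints pos_ratio close to 2.0 at the freerun ceiling, exactly 1.0 at the setpoint and near 0 just below the limit, matching constraints (1)-(3) in the comment.
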
642
643static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
644 unsigned long elapsed,
645 unsigned long written)
646{
647 const unsigned long period = roundup_pow_of_two(3 * HZ);
648 unsigned long avg = bdi->avg_write_bandwidth;
649 unsigned long old = bdi->write_bandwidth;
650 u64 bw;
651
652 /*
653 * bw = written * HZ / elapsed
654 *
655 * bw * elapsed + write_bandwidth * (period - elapsed)
656 * write_bandwidth = ---------------------------------------------------
657 * period
658 */
659 bw = written - bdi->written_stamp;
660 bw *= HZ;
661 if (unlikely(elapsed > period)) {
662 do_div(bw, elapsed);
663 avg = bw;
664 goto out;
665 }
666 bw += (u64)bdi->write_bandwidth * (period - elapsed);
667 bw >>= ilog2(period);
668
669 /*
670 * one more level of smoothing, for filtering out sudden spikes
671 */
672 if (avg > old && old >= (unsigned long)bw)
673 avg -= (avg - old) >> 3;
674
675 if (avg < old && old <= (unsigned long)bw)
676 avg += (old - avg) >> 3;
677
678out:
679 bdi->write_bandwidth = bw;
680 bdi->avg_write_bandwidth = avg;
681}
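
To see what the period weighting does, here is a rough userspace sketch of just the first-level estimate (the avg spike filter is left out). HZ is assumed to be 100, the write pattern is invented, and update_bw() is a hypothetical helper that only mirrors the arithmetic in the comment above.

#include <stdio.h>

#define HZ      100     /* assumed */
#define PERIOD  512     /* roundup_pow_of_two(3 * HZ) for HZ == 100 */

/* one estimation step: pages written and jiffies elapsed since the last one */
static unsigned long update_bw(unsigned long old_bw, unsigned long written,
                               unsigned long elapsed)
{
        unsigned long long bw = (unsigned long long)written * HZ;

        if (elapsed > PERIOD)
                return bw / elapsed;    /* sample covers the whole period */

        bw += (unsigned long long)old_bw * (PERIOD - elapsed);
        return bw / PERIOD;             /* the kernel does >> ilog2(period) */
}

int main(void)
{
        unsigned long bw = 0;
        int i;

        /* pretend the disk completes 2560 pages (10MB) every 20 jiffies (200ms) */
        for (i = 0; i < 200; i++)
                bw = update_bw(bw, 2560, 20);
        printf("estimated write bandwidth: %lu pages/s (~%lu MB/s)\n",
               bw, bw * 4 / 1024);
        return 0;
}

With a steady 10MB per 200ms the estimate converges to about 12800 pages/s (roughly 50MB/s with 4KB pages); because each 200ms sample carries only elapsed/period (about 4%) of the weight, short hiccups barely move it.
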
682
683/*
684 * The global dirtyable memory and dirty threshold could be suddenly knocked
 685 * down by a large amount (e.g. on the startup of KVM in a swapless system).
686 * This may throw the system into deep dirty exceeded state and throttle
687 * heavy/light dirtiers alike. To retain good responsiveness, maintain
688 * global_dirty_limit for tracking slowly down to the knocked down dirty
689 * threshold.
690 */
691static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
692{
693 unsigned long limit = global_dirty_limit;
694
695 /*
696 * Follow up in one step.
697 */
698 if (limit < thresh) {
699 limit = thresh;
700 goto update;
701 }
702
703 /*
704 * Follow down slowly. Use the higher one as the target, because thresh
705 * may drop below dirty. This is exactly the reason to introduce
706 * global_dirty_limit which is guaranteed to lie above the dirty pages.
707 */
708 thresh = max(thresh, dirty);
709 if (limit > thresh) {
710 limit -= (limit - thresh) >> 5;
711 goto update;
712 }
713 return;
714update:
715 global_dirty_limit = limit;
716}
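
A quick way to see the asymmetric tracking is to iterate the decay step by hand. This is only an illustration with invented numbers; the 1/32 factor is the one used above, applied once per ~200ms bandwidth update.

#include <stdio.h>

int main(void)
{
        unsigned long limit = 100000;   /* old global_dirty_limit, in pages */
        unsigned long thresh = 20000;   /* the threshold after a sudden drop */
        unsigned long dirty = 15000;    /* current dirty pages */
        unsigned long target = thresh > dirty ? thresh : dirty;
        int step;

        printf("start: global_dirty_limit = %lu\n", limit);
        for (step = 1; step <= 10 && limit > target; step++) {
                limit -= (limit - target) >> 5; /* the same 1/32 decay as above */
                printf("update %2d: global_dirty_limit = %lu\n", step, limit);
        }
        return 0;
}

So after a sudden drop of the threshold, global_dirty_limit loses about 3% of the remaining gap per update and takes tens of seconds to settle, instead of instantly declaring every dirtier over the limit.
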
717
718static void global_update_bandwidth(unsigned long thresh,
719 unsigned long dirty,
720 unsigned long now)
721{
722 static DEFINE_SPINLOCK(dirty_lock);
723 static unsigned long update_time;
724
725 /*
 726 * check locklessly first to optimize away locking most of the time
727 */
728 if (time_before(now, update_time + BANDWIDTH_INTERVAL))
729 return;
730
731 spin_lock(&dirty_lock);
732 if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
733 update_dirty_limit(thresh, dirty);
734 update_time = now;
735 }
736 spin_unlock(&dirty_lock);
737}
738
739/*
740 * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
741 *
 742 * Normal bdi tasks will be curbed at or below it in the long term.
743 * Obviously it should be around (write_bw / N) when there are N dd tasks.
744 */
745static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
746 unsigned long thresh,
747 unsigned long bg_thresh,
748 unsigned long dirty,
749 unsigned long bdi_thresh,
750 unsigned long bdi_dirty,
751 unsigned long dirtied,
752 unsigned long elapsed)
753{
754 unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
755 unsigned long limit = hard_dirty_limit(thresh);
756 unsigned long setpoint = (freerun + limit) / 2;
757 unsigned long write_bw = bdi->avg_write_bandwidth;
758 unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
759 unsigned long dirty_rate;
760 unsigned long task_ratelimit;
761 unsigned long balanced_dirty_ratelimit;
762 unsigned long pos_ratio;
763 unsigned long step;
764 unsigned long x;
765
766 /*
 767 * The dirty rate will match the writeout rate in the long term, except
768 * when dirty pages are truncated by userspace or re-dirtied by FS.
769 */
770 dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
771
772 pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
773 bdi_thresh, bdi_dirty);
774 /*
775 * task_ratelimit reflects each dd's dirty rate for the past 200ms.
776 */
777 task_ratelimit = (u64)dirty_ratelimit *
778 pos_ratio >> RATELIMIT_CALC_SHIFT;
 779 task_ratelimit++; /* it helps ramp up dirty_ratelimit from tiny values */
780
781 /*
782 * A linear estimation of the "balanced" throttle rate. The theory is,
783 * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
784 * dirty_rate will be measured to be (N * task_ratelimit). So the below
785 * formula will yield the balanced rate limit (write_bw / N).
786 *
787 * Note that the expanded form is not a pure rate feedback:
788 * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1)
789 * but also takes pos_ratio into account:
790 * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2)
791 *
792 * (1) is not realistic because pos_ratio also takes part in balancing
793 * the dirty rate. Consider the state
794 * pos_ratio = 0.5 (3)
795 * rate = 2 * (write_bw / N) (4)
 796 * If (1) is used, it will get stuck in that state, because each dd will
797 * be throttled at
798 * task_ratelimit = pos_ratio * rate = (write_bw / N) (5)
799 * yielding
800 * dirty_rate = N * task_ratelimit = write_bw (6)
801 * put (6) into (1) we get
802 * rate_(i+1) = rate_(i) (7)
803 *
804 * So we end up using (2) to always keep
805 * rate_(i+1) ~= (write_bw / N) (8)
806 * regardless of the value of pos_ratio. As long as (8) is satisfied,
807 * pos_ratio is able to drive itself to 1.0, which is not only where
 808 * the dirty count meets the setpoint, but also where the slope of
809 * pos_ratio is most flat and hence task_ratelimit is least fluctuated.
810 */
811 balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
812 dirty_rate | 1);
813
814 /*
815 * We could safely do this and return immediately:
816 *
817 * bdi->dirty_ratelimit = balanced_dirty_ratelimit;
818 *
819 * However to get a more stable dirty_ratelimit, the below elaborated
 820 * code makes use of task_ratelimit to filter out singular points and
821 * limit the step size.
822 *
823 * The below code essentially only uses the relative value of
824 *
825 * task_ratelimit - dirty_ratelimit
826 * = (pos_ratio - 1) * dirty_ratelimit
827 *
828 * which reflects the direction and size of dirty position error.
829 */
830
831 /*
832 * dirty_ratelimit will follow balanced_dirty_ratelimit iff
833 * task_ratelimit is on the same side of dirty_ratelimit, too.
834 * For example, when
835 * - dirty_ratelimit > balanced_dirty_ratelimit
836 * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
837 * lowering dirty_ratelimit will help meet both the position and rate
838 * control targets. Otherwise, don't update dirty_ratelimit if it will
839 * only help meet the rate target. After all, what the users ultimately
840 * feel and care are stable dirty rate and small position error.
841 *
842 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
 843 * and filter out the singular points of balanced_dirty_ratelimit, which
844 * keeps jumping around randomly and can even leap far away at times
845 * due to the small 200ms estimation period of dirty_rate (we want to
846 * keep that period small to reduce time lags).
847 */
848 step = 0;
849 if (dirty < setpoint) {
850 x = min(bdi->balanced_dirty_ratelimit,
851 min(balanced_dirty_ratelimit, task_ratelimit));
852 if (dirty_ratelimit < x)
853 step = x - dirty_ratelimit;
854 } else {
855 x = max(bdi->balanced_dirty_ratelimit,
856 max(balanced_dirty_ratelimit, task_ratelimit));
857 if (dirty_ratelimit > x)
858 step = dirty_ratelimit - x;
859 }
860
861 /*
862 * Don't pursue 100% rate matching. It's impossible since the balanced
863 * rate itself is constantly fluctuating. So decrease the track speed
864 * when it gets close to the target. Helps eliminate pointless tremors.
865 */
866 step >>= dirty_ratelimit / (2 * step + 1);
867 /*
868 * Limit the tracking speed to avoid overshooting.
869 */
870 step = (step + 7) / 8;
871
872 if (dirty_ratelimit < balanced_dirty_ratelimit)
873 dirty_ratelimit += step;
874 else
875 dirty_ratelimit -= step;
876
877 bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
878 bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
879
880 trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
881}
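
The claim that formula (2) settles at write_bw / N regardless of pos_ratio can be checked with a toy model. The sketch below uses invented numbers, a fixed pos_ratio and none of the step limiting above, so it only illustrates the balanced_dirty_ratelimit estimation, not the full algorithm.

#include <stdio.h>

int main(void)
{
        unsigned long write_bw = 12800; /* pages/s, invented */
        unsigned long rate = 100000;    /* wildly wrong initial dirty_ratelimit */
        double pos_ratio = 0.5;         /* arbitrary position error, held constant */
        int N = 4;                      /* number of dd-style dirtiers */
        int i;

        for (i = 0; i < 5; i++) {
                /* each task is throttled at task_ratelimit = rate * pos_ratio */
                double task_ratelimit = rate * pos_ratio;
                /* so the bdi observes dirty_rate = N * task_ratelimit */
                double dirty_rate = N * task_ratelimit;
                /* formula (2): rate_(i+1) = rate_(i) * pos_ratio * write_bw / dirty_rate */
                rate = (unsigned long)(task_ratelimit * write_bw / dirty_rate);
                printf("iteration %d: balanced rate = %lu (write_bw / N = %lu)\n",
                       i, rate, write_bw / N);
        }
        return 0;
}

After the first iteration the balanced rate is already write_bw / N = 3200 pages/s and stays there, whatever pos_ratio happens to be; the machinery above only decides how quickly and smoothly dirty_ratelimit is allowed to move toward it.
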
882
883void __bdi_update_bandwidth(struct backing_dev_info *bdi,
884 unsigned long thresh,
885 unsigned long bg_thresh,
886 unsigned long dirty,
887 unsigned long bdi_thresh,
888 unsigned long bdi_dirty,
889 unsigned long start_time)
890{
891 unsigned long now = jiffies;
892 unsigned long elapsed = now - bdi->bw_time_stamp;
893 unsigned long dirtied;
894 unsigned long written;
895
896 /*
897 * rate-limit, only update once every 200ms.
898 */
899 if (elapsed < BANDWIDTH_INTERVAL)
900 return;
901
902 dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
903 written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
904
905 /*
906 * Skip quiet periods when disk bandwidth is under-utilized.
907 * (at least 1s idle time between two flusher runs)
908 */
909 if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
910 goto snapshot;
911
912 if (thresh) {
913 global_update_bandwidth(thresh, dirty, now);
914 bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
915 bdi_thresh, bdi_dirty,
916 dirtied, elapsed);
917 }
918 bdi_update_write_bandwidth(bdi, elapsed, written);
919
920snapshot:
921 bdi->dirtied_stamp = dirtied;
922 bdi->written_stamp = written;
923 bdi->bw_time_stamp = now;
924}
925
926static void bdi_update_bandwidth(struct backing_dev_info *bdi,
927 unsigned long thresh,
928 unsigned long bg_thresh,
929 unsigned long dirty,
930 unsigned long bdi_thresh,
931 unsigned long bdi_dirty,
932 unsigned long start_time)
933{
934 if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
935 return;
936 spin_lock(&bdi->wb.list_lock);
937 __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
938 bdi_thresh, bdi_dirty, start_time);
939 spin_unlock(&bdi->wb.list_lock);
940}
941
942/*
943 * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
944 * will look to see if it needs to start dirty throttling.
945 *
946 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
947 * global_page_state() too often. So scale it near-sqrt to the safety margin
948 * (the number of pages we may dirty without exceeding the dirty limits).
949 */
950static unsigned long dirty_poll_interval(unsigned long dirty,
951 unsigned long thresh)
952{
953 if (thresh > dirty)
954 return 1UL << (ilog2(thresh - dirty) >> 1);
955
956 return 1;
957}
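
For illustration, the near-sqrt scaling can be tabulated in userspace; ilog2_ul() and poll_interval() below are hypothetical stand-ins for the kernel helpers, and the threshold is invented.

#include <stdio.h>

static unsigned int ilog2_ul(unsigned long x)   /* floor(log2(x)), x > 0 */
{
        unsigned int r = 0;

        while (x >>= 1)
                r++;
        return r;
}

static unsigned long poll_interval(unsigned long dirty, unsigned long thresh)
{
        if (thresh > dirty)
                return 1UL << (ilog2_ul(thresh - dirty) >> 1);
        return 1;
}

int main(void)
{
        unsigned long thresh = 100000;  /* invented dirty threshold, in pages */
        unsigned long margins[] = { 100000, 10000, 1000, 100, 10 };
        int i;

        for (i = 0; i < 5; i++)
                printf("margin=%6lu  poll every %3lu dirtied pages\n",
                       margins[i], poll_interval(thresh - margins[i], thresh));
        return 0;
}

The interval tracks roughly the square root of the remaining margin, so a task polls the expensive global counters rarely when far from the limit and every couple of pages when close to it.
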
958
959static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
960 unsigned long bdi_dirty)
961{
962 unsigned long bw = bdi->avg_write_bandwidth;
963 unsigned long hi = ilog2(bw);
964 unsigned long lo = ilog2(bdi->dirty_ratelimit);
965 unsigned long t;
966
967 /* target for 20ms max pause on 1-dd case */
968 t = HZ / 50;
969
970 /*
971 * Scale up pause time for concurrent dirtiers in order to reduce CPU
972 * overheads.
973 *
974 * (N * 20ms) on 2^N concurrent tasks.
975 */
976 if (hi > lo)
977 t += (hi - lo) * (20 * HZ) / 1024;
978
979 /*
 980 * Limit pause time for small memory systems. If sleeping for too
 981 * long, a small pool of dirty/writeback pages may go empty and the
 982 * disk may go idle.
983 *
984 * 8 serves as the safety ratio.
985 */
986 if (bdi_dirty)
987 t = min(t, bdi_dirty * HZ / (8 * bw + 1));
988
989 /*
990 * The pause time will be settled within range (max_pause/4, max_pause).
991 * Apply a minimal value of 4 to get a non-zero max_pause/4.
992 */
993 return clamp_val(t, 4, MAX_PAUSE);
994}
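
To get a feel for the resulting numbers, here is a self-contained sketch. HZ is assumed to be 1000, MAX_PAUSE is assumed to be a 200ms cap, ilog2_ul() and clampv() are hypothetical stand-ins for the kernel helpers, and the bandwidth/ratelimit figures are invented.

#include <stdio.h>

#define HZ              1000            /* assumed */
#define MAX_PAUSE       (HZ / 5)        /* assumed 200ms cap */

static unsigned int ilog2_ul(unsigned long x)
{
        unsigned int r = 0;

        while (x >>= 1)
                r++;
        return r;
}

static unsigned long clampv(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

static unsigned long max_pause(unsigned long write_bw, unsigned long ratelimit,
                               unsigned long bdi_dirty)
{
        unsigned long hi = ilog2_ul(write_bw);
        unsigned long lo = ilog2_ul(ratelimit);
        unsigned long t = HZ / 50;              /* 20ms base pause */
        unsigned long cap;

        if (hi > lo)                            /* ~2^(hi-lo) concurrent dirtiers */
                t += (hi - lo) * (20 * HZ) / 1024;
        if (bdi_dirty) {                        /* don't drain a small pool dry */
                cap = bdi_dirty * HZ / (8 * write_bw + 1);
                if (t > cap)
                        t = cap;
        }
        return clampv(t, 4, MAX_PAUSE);
}

int main(void)
{
        /* 12800 pages/s disk; 8 dirtiers share it at ~1600 pages/s each */
        printf("1 dirtier:        %lu jiffies\n", max_pause(12800, 12800, 50000));
        printf("8 dirtiers:       %lu jiffies\n", max_pause(12800, 1600, 50000));
        printf("nearly empty bdi: %lu jiffies\n", max_pause(12800, 1600, 500));
        return 0;
}

A single dirtier gets the 20ms target, eight concurrent dirtiers (hi - lo = 3) stretch it to roughly 80ms, and a nearly empty bdi pool clamps it to the 4-jiffy floor so the disk is not allowed to run dry.
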
995
996/*
472 * balance_dirty_pages() must be called by processes which are generating dirty 997 * balance_dirty_pages() must be called by processes which are generating dirty
473 * data. It looks at the number of dirty pages in the machine and will force 998 * data. It looks at the number of dirty pages in the machine and will force
474 * the caller to perform writeback if the system is over `vm_dirty_ratio'. 999 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
475 * If we're over `background_thresh' then the writeback threads are woken to 1000 * If we're over `background_thresh' then the writeback threads are woken to
476 * perform some writeout. 1001 * perform some writeout.
477 */ 1002 */
478static void balance_dirty_pages(struct address_space *mapping, 1003static void balance_dirty_pages(struct address_space *mapping,
479 unsigned long write_chunk) 1004 unsigned long pages_dirtied)
480{ 1005{
481 long nr_reclaimable, bdi_nr_reclaimable; 1006 unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
482 long nr_writeback, bdi_nr_writeback; 1007 unsigned long bdi_reclaimable;
1008 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
1009 unsigned long bdi_dirty;
1010 unsigned long freerun;
483 unsigned long background_thresh; 1011 unsigned long background_thresh;
484 unsigned long dirty_thresh; 1012 unsigned long dirty_thresh;
485 unsigned long bdi_thresh; 1013 unsigned long bdi_thresh;
486 unsigned long pages_written = 0; 1014 long pause = 0;
487 unsigned long pause = 1; 1015 long max_pause;
488 bool dirty_exceeded = false; 1016 bool dirty_exceeded = false;
1017 unsigned long task_ratelimit;
1018 unsigned long dirty_ratelimit;
1019 unsigned long pos_ratio;
489 struct backing_dev_info *bdi = mapping->backing_dev_info; 1020 struct backing_dev_info *bdi = mapping->backing_dev_info;
1021 unsigned long start_time = jiffies;
490 1022
491 for (;;) { 1023 for (;;) {
492 struct writeback_control wbc = { 1024 /*
493 .sync_mode = WB_SYNC_NONE, 1025 * Unstable writes are a feature of certain networked
494 .older_than_this = NULL, 1026 * filesystems (i.e. NFS) in which data may have been
495 .nr_to_write = write_chunk, 1027 * written to the server's write cache, but has not yet
496 .range_cyclic = 1, 1028 * been flushed to permanent storage.
497 }; 1029 */
498
499 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 1030 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
500 global_page_state(NR_UNSTABLE_NFS); 1031 global_page_state(NR_UNSTABLE_NFS);
501 nr_writeback = global_page_state(NR_WRITEBACK); 1032 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
502 1033
503 global_dirty_limits(&background_thresh, &dirty_thresh); 1034 global_dirty_limits(&background_thresh, &dirty_thresh);
504 1035
@@ -507,12 +1038,28 @@ static void balance_dirty_pages(struct address_space *mapping,
507 * catch-up. This avoids (excessively) small writeouts 1038 * catch-up. This avoids (excessively) small writeouts
508 * when the bdi limits are ramping up. 1039 * when the bdi limits are ramping up.
509 */ 1040 */
510 if (nr_reclaimable + nr_writeback <= 1041 freerun = dirty_freerun_ceiling(dirty_thresh,
511 (background_thresh + dirty_thresh) / 2) 1042 background_thresh);
1043 if (nr_dirty <= freerun)
512 break; 1044 break;
513 1045
1046 if (unlikely(!writeback_in_progress(bdi)))
1047 bdi_start_background_writeback(bdi);
1048
1049 /*
 1050 * bdi_thresh is not treated as a hard limiting factor the way
 1051 * dirty_thresh is, for two reasons:
1052 * - in JBOD setup, bdi_thresh can fluctuate a lot
1053 * - in a system with HDD and USB key, the USB key may somehow
1054 * go into state (bdi_dirty >> bdi_thresh) either because
1055 * bdi_dirty starts high, or because bdi_thresh drops low.
1056 * In this case we don't want to hard throttle the USB key
1057 * dirtiers for 100 seconds until bdi_dirty drops under
1058 * bdi_thresh. Instead the auxiliary bdi control line in
1059 * bdi_position_ratio() will let the dirtier task progress
1060 * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
1061 */
514 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 1062 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
515 bdi_thresh = task_dirty_limit(current, bdi_thresh);
516 1063
517 /* 1064 /*
518 * In order to avoid the stacked BDI deadlock we need 1065 * In order to avoid the stacked BDI deadlock we need
@@ -524,63 +1071,98 @@ static void balance_dirty_pages(struct address_space *mapping,
524 * actually dirty; with m+n sitting in the percpu 1071 * actually dirty; with m+n sitting in the percpu
525 * deltas. 1072 * deltas.
526 */ 1073 */
527 if (bdi_thresh < 2*bdi_stat_error(bdi)) { 1074 if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
528 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 1075 bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
529 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); 1076 bdi_dirty = bdi_reclaimable +
1077 bdi_stat_sum(bdi, BDI_WRITEBACK);
530 } else { 1078 } else {
531 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 1079 bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
532 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); 1080 bdi_dirty = bdi_reclaimable +
1081 bdi_stat(bdi, BDI_WRITEBACK);
533 } 1082 }
534 1083
535 /* 1084 dirty_exceeded = (bdi_dirty > bdi_thresh) ||
536 * The bdi thresh is somehow "soft" limit derived from the 1085 (nr_dirty > dirty_thresh);
537 * global "hard" limit. The former helps to prevent heavy IO 1086 if (dirty_exceeded && !bdi->dirty_exceeded)
538 * bdi or process from holding back light ones; The latter is 1087 bdi->dirty_exceeded = 1;
539 * the last resort safeguard.
540 */
541 dirty_exceeded =
542 (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
543 || (nr_reclaimable + nr_writeback > dirty_thresh);
544 1088
545 if (!dirty_exceeded) 1089 bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
546 break; 1090 nr_dirty, bdi_thresh, bdi_dirty,
1091 start_time);
547 1092
548 if (!bdi->dirty_exceeded) 1093 max_pause = bdi_max_pause(bdi, bdi_dirty);
549 bdi->dirty_exceeded = 1;
550 1094
551 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 1095 dirty_ratelimit = bdi->dirty_ratelimit;
552 * Unstable writes are a feature of certain networked 1096 pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
553 * filesystems (i.e. NFS) in which data may have been 1097 background_thresh, nr_dirty,
554 * written to the server's write cache, but has not yet 1098 bdi_thresh, bdi_dirty);
555 * been flushed to permanent storage. 1099 if (unlikely(pos_ratio == 0)) {
556 * Only move pages to writeback if this bdi is over its 1100 pause = max_pause;
557 * threshold otherwise wait until the disk writes catch 1101 goto pause;
558 * up. 1102 }
559 */ 1103 task_ratelimit = (u64)dirty_ratelimit *
560 trace_wbc_balance_dirty_start(&wbc, bdi); 1104 pos_ratio >> RATELIMIT_CALC_SHIFT;
561 if (bdi_nr_reclaimable > bdi_thresh) { 1105 pause = (HZ * pages_dirtied) / (task_ratelimit | 1);
562 writeback_inodes_wb(&bdi->wb, &wbc); 1106 if (unlikely(pause <= 0)) {
563 pages_written += write_chunk - wbc.nr_to_write; 1107 trace_balance_dirty_pages(bdi,
564 trace_wbc_balance_dirty_written(&wbc, bdi); 1108 dirty_thresh,
565 if (pages_written >= write_chunk) 1109 background_thresh,
566 break; /* We've done our duty */ 1110 nr_dirty,
1111 bdi_thresh,
1112 bdi_dirty,
1113 dirty_ratelimit,
1114 task_ratelimit,
1115 pages_dirtied,
1116 pause,
1117 start_time);
1118 pause = 1; /* avoid resetting nr_dirtied_pause below */
1119 break;
567 } 1120 }
568 trace_wbc_balance_dirty_wait(&wbc, bdi); 1121 pause = min(pause, max_pause);
1122
1123pause:
1124 trace_balance_dirty_pages(bdi,
1125 dirty_thresh,
1126 background_thresh,
1127 nr_dirty,
1128 bdi_thresh,
1129 bdi_dirty,
1130 dirty_ratelimit,
1131 task_ratelimit,
1132 pages_dirtied,
1133 pause,
1134 start_time);
569 __set_current_state(TASK_UNINTERRUPTIBLE); 1135 __set_current_state(TASK_UNINTERRUPTIBLE);
570 io_schedule_timeout(pause); 1136 io_schedule_timeout(pause);
571 1137
572 /* 1138 /*
573 * Increase the delay for each loop, up to our previous 1139 * This is typically equal to (nr_dirty < dirty_thresh) and can
574 * default of taking a 100ms nap. 1140 * also keep "1000+ dd on a slow USB stick" under control.
575 */ 1141 */
576 pause <<= 1; 1142 if (task_ratelimit)
577 if (pause > HZ / 10) 1143 break;
578 pause = HZ / 10;
579 } 1144 }
580 1145
581 if (!dirty_exceeded && bdi->dirty_exceeded) 1146 if (!dirty_exceeded && bdi->dirty_exceeded)
582 bdi->dirty_exceeded = 0; 1147 bdi->dirty_exceeded = 0;
583 1148
1149 current->nr_dirtied = 0;
1150 if (pause == 0) { /* in freerun area */
1151 current->nr_dirtied_pause =
1152 dirty_poll_interval(nr_dirty, dirty_thresh);
1153 } else if (pause <= max_pause / 4 &&
1154 pages_dirtied >= current->nr_dirtied_pause) {
1155 current->nr_dirtied_pause = clamp_val(
1156 dirty_ratelimit * (max_pause / 2) / HZ,
1157 pages_dirtied + pages_dirtied / 8,
1158 pages_dirtied * 4);
1159 } else if (pause >= max_pause) {
1160 current->nr_dirtied_pause = 1 | clamp_val(
1161 dirty_ratelimit * (max_pause / 2) / HZ,
1162 pages_dirtied / 4,
1163 pages_dirtied - pages_dirtied / 8);
1164 }
1165
584 if (writeback_in_progress(bdi)) 1166 if (writeback_in_progress(bdi))
585 return; 1167 return;
586 1168
@@ -592,8 +1174,10 @@ static void balance_dirty_pages(struct address_space *mapping,
592 * In normal mode, we start background writeout at the lower 1174 * In normal mode, we start background writeout at the lower
593 * background_thresh, to keep the amount of dirty memory low. 1175 * background_thresh, to keep the amount of dirty memory low.
594 */ 1176 */
595 if ((laptop_mode && pages_written) || 1177 if (laptop_mode)
596 (!laptop_mode && (nr_reclaimable > background_thresh))) 1178 return;
1179
1180 if (nr_reclaimable > background_thresh)
597 bdi_start_background_writeback(bdi); 1181 bdi_start_background_writeback(bdi);
598} 1182}
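
The heart of the loop above is the pause computation: each task sleeps long enough that its pages_dirtied quota, paid out at task_ratelimit pages per second, spans the pause. A hedged arithmetic sketch follows; HZ, RATELIMIT_CALC_SHIFT and all figures are assumptions, and the max_pause clamp and tracepoints are omitted.

#include <stdio.h>

#define HZ                      1000    /* assumed */
#define RATELIMIT_CALC_SHIFT    10      /* assumed, the kernel's fixed-point shift */

int main(void)
{
        unsigned long pages_dirtied = 512;      /* current nr_dirtied_pause quota */
        unsigned long dirty_ratelimit = 3200;   /* pages/s, from the bdi */
        unsigned long pos_ratios[] = { 512, 1024, 2048 };       /* 0.5x, 1.0x, 2.0x */
        int i;

        for (i = 0; i < 3; i++) {
                unsigned long task_ratelimit =
                        (unsigned long long)dirty_ratelimit * pos_ratios[i]
                                >> RATELIMIT_CALC_SHIFT;
                long pause = HZ * pages_dirtied / (task_ratelimit | 1);

                printf("pos_ratio=%.2f  task_ratelimit=%4lu pages/s  pause=%3ld ms\n",
                       pos_ratios[i] / 1024.0, task_ratelimit, pause * 1000 / HZ);
        }
        return 0;
}

At the setpoint (pos_ratio = 1.0) a 512-page quota against a 3200 pages/s base rate yields a 160ms nap; above the setpoint the nap stretches, below it shrinks, which is exactly the throttling signal pos_ratio was built to provide.
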
599 1183
@@ -607,7 +1191,7 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
607 } 1191 }
608} 1192}
609 1193
610static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; 1194static DEFINE_PER_CPU(int, bdp_ratelimits);
611 1195
612/** 1196/**
613 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 1197 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
@@ -626,28 +1210,40 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
626void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 1210void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
627 unsigned long nr_pages_dirtied) 1211 unsigned long nr_pages_dirtied)
628{ 1212{
629 unsigned long ratelimit; 1213 struct backing_dev_info *bdi = mapping->backing_dev_info;
630 unsigned long *p; 1214 int ratelimit;
1215 int *p;
1216
1217 if (!bdi_cap_account_dirty(bdi))
1218 return;
631 1219
632 ratelimit = ratelimit_pages; 1220 ratelimit = current->nr_dirtied_pause;
633 if (mapping->backing_dev_info->dirty_exceeded) 1221 if (bdi->dirty_exceeded)
634 ratelimit = 8; 1222 ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
635 1223
1224 current->nr_dirtied += nr_pages_dirtied;
1225
1226 preempt_disable();
636 /* 1227 /*
 637 * Check the rate limiting. Also, we do not want to throttle real-time 1228 * This prevents one CPU from accumulating too many dirtied pages without
638 * tasks in balance_dirty_pages(). Period. 1229 * calling into balance_dirty_pages(), which can happen when there are
 1230 * 1000+ tasks, all of them starting to dirty pages at exactly the same
 1231 * time, hence all honouring a too-large initial task->nr_dirtied_pause.
639 */ 1232 */
640 preempt_disable();
641 p = &__get_cpu_var(bdp_ratelimits); 1233 p = &__get_cpu_var(bdp_ratelimits);
642 *p += nr_pages_dirtied; 1234 if (unlikely(current->nr_dirtied >= ratelimit))
643 if (unlikely(*p >= ratelimit)) {
644 ratelimit = sync_writeback_pages(*p);
645 *p = 0; 1235 *p = 0;
646 preempt_enable(); 1236 else {
647 balance_dirty_pages(mapping, ratelimit); 1237 *p += nr_pages_dirtied;
648 return; 1238 if (unlikely(*p >= ratelimit_pages)) {
1239 *p = 0;
1240 ratelimit = 0;
1241 }
649 } 1242 }
650 preempt_enable(); 1243 preempt_enable();
1244
1245 if (unlikely(current->nr_dirtied >= ratelimit))
1246 balance_dirty_pages(mapping, current->nr_dirtied);
651} 1247}
652EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); 1248EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
653 1249
@@ -703,7 +1299,8 @@ void laptop_mode_timer_fn(unsigned long data)
703 * threshold 1299 * threshold
704 */ 1300 */
705 if (bdi_has_dirty_io(&q->backing_dev_info)) 1301 if (bdi_has_dirty_io(&q->backing_dev_info))
706 bdi_start_writeback(&q->backing_dev_info, nr_pages); 1302 bdi_start_writeback(&q->backing_dev_info, nr_pages,
1303 WB_REASON_LAPTOP_TIMER);
707} 1304}
708 1305
709/* 1306/*
@@ -742,22 +1339,17 @@ void laptop_sync_completion(void)
742 * 1339 *
743 * Here we set ratelimit_pages to a level which ensures that when all CPUs are 1340 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
744 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory 1341 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
745 * thresholds before writeback cuts in. 1342 * thresholds.
746 *
747 * But the limit should not be set too high. Because it also controls the
748 * amount of memory which the balance_dirty_pages() caller has to write back.
749 * If this is too large then the caller will block on the IO queue all the
750 * time. So limit it to four megabytes - the balance_dirty_pages() caller
751 * will write six megabyte chunks, max.
752 */ 1343 */
753 1344
754void writeback_set_ratelimit(void) 1345void writeback_set_ratelimit(void)
755{ 1346{
756 ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); 1347 unsigned long background_thresh;
1348 unsigned long dirty_thresh;
1349 global_dirty_limits(&background_thresh, &dirty_thresh);
1350 ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
757 if (ratelimit_pages < 16) 1351 if (ratelimit_pages < 16)
758 ratelimit_pages = 16; 1352 ratelimit_pages = 16;
759 if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
760 ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
761} 1353}
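
The "3% (1/32)" figure in the comment above falls straight out of the formula. A small sketch with an invented dirty threshold shows that the worst-case slack (every CPU's counter full at once) stays near dirty_thresh / 32 regardless of CPU count:

#include <stdio.h>

int main(void)
{
        unsigned long dirty_thresh = 200000;    /* pages, invented */
        int cpus;

        for (cpus = 1; cpus <= 64; cpus *= 4) {
                unsigned long ratelimit = dirty_thresh / (cpus * 32);

                if (ratelimit < 16)
                        ratelimit = 16;
                printf("%2d CPUs: ratelimit_pages=%5lu  worst-case slack=%6lu pages\n",
                       cpus, ratelimit, ratelimit * cpus);
        }
        return 0;
}

The per-CPU quota shrinks as CPUs are added (down to the 16-page floor), so the aggregate slack stays roughly constant at about 1/32 of the threshold.
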
762 1354
763static int __cpuinit 1355static int __cpuinit
@@ -892,12 +1484,12 @@ int write_cache_pages(struct address_space *mapping,
892 range_whole = 1; 1484 range_whole = 1;
893 cycled = 1; /* ignore range_cyclic tests */ 1485 cycled = 1; /* ignore range_cyclic tests */
894 } 1486 }
895 if (wbc->sync_mode == WB_SYNC_ALL) 1487 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
896 tag = PAGECACHE_TAG_TOWRITE; 1488 tag = PAGECACHE_TAG_TOWRITE;
897 else 1489 else
898 tag = PAGECACHE_TAG_DIRTY; 1490 tag = PAGECACHE_TAG_DIRTY;
899retry: 1491retry:
900 if (wbc->sync_mode == WB_SYNC_ALL) 1492 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
901 tag_pages_for_writeback(mapping, index, end); 1493 tag_pages_for_writeback(mapping, index, end);
902 done_index = index; 1494 done_index = index;
903 while (!done && (index <= end)) { 1495 while (!done && (index <= end)) {
@@ -1127,6 +1719,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1127 __inc_zone_page_state(page, NR_FILE_DIRTY); 1719 __inc_zone_page_state(page, NR_FILE_DIRTY);
1128 __inc_zone_page_state(page, NR_DIRTIED); 1720 __inc_zone_page_state(page, NR_DIRTIED);
1129 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 1721 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1722 __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
1130 task_dirty_inc(current); 1723 task_dirty_inc(current);
1131 task_io_account_write(PAGE_CACHE_SIZE); 1724 task_io_account_write(PAGE_CACHE_SIZE);
1132 } 1725 }
@@ -1141,7 +1734,6 @@ EXPORT_SYMBOL(account_page_dirtied);
1141void account_page_writeback(struct page *page) 1734void account_page_writeback(struct page *page)
1142{ 1735{
1143 inc_zone_page_state(page, NR_WRITEBACK); 1736 inc_zone_page_state(page, NR_WRITEBACK);
1144 inc_zone_page_state(page, NR_WRITTEN);
1145} 1737}
1146EXPORT_SYMBOL(account_page_writeback); 1738EXPORT_SYMBOL(account_page_writeback);
1147 1739
@@ -1358,8 +1950,10 @@ int test_clear_page_writeback(struct page *page)
1358 } else { 1950 } else {
1359 ret = TestClearPageWriteback(page); 1951 ret = TestClearPageWriteback(page);
1360 } 1952 }
1361 if (ret) 1953 if (ret) {
1362 dec_zone_page_state(page, NR_WRITEBACK); 1954 dec_zone_page_state(page, NR_WRITEBACK);
1955 inc_zone_page_state(page, NR_WRITTEN);
1956 }
1363 return ret; 1957 return ret;
1364} 1958}
1365 1959
@@ -1405,10 +1999,6 @@ EXPORT_SYMBOL(test_set_page_writeback);
1405 */ 1999 */
1406int mapping_tagged(struct address_space *mapping, int tag) 2000int mapping_tagged(struct address_space *mapping, int tag)
1407{ 2001{
1408 int ret; 2002 return radix_tree_tagged(&mapping->page_tree, tag);
1409 rcu_read_lock();
1410 ret = radix_tree_tagged(&mapping->page_tree, tag);
1411 rcu_read_unlock();
1412 return ret;
1413} 2003}
1414EXPORT_SYMBOL(mapping_tagged); 2004EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e8985acdab..87b0a3f074e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -127,6 +127,20 @@ void pm_restrict_gfp_mask(void)
127 saved_gfp_mask = gfp_allowed_mask; 127 saved_gfp_mask = gfp_allowed_mask;
128 gfp_allowed_mask &= ~GFP_IOFS; 128 gfp_allowed_mask &= ~GFP_IOFS;
129} 129}
130
131static bool pm_suspending(void)
132{
133 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
134 return false;
135 return true;
136}
137
138#else
139
140static bool pm_suspending(void)
141{
142 return false;
143}
130#endif /* CONFIG_PM_SLEEP */ 144#endif /* CONFIG_PM_SLEEP */
131 145
132#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 146#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -176,6 +190,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
176}; 190};
177 191
178int min_free_kbytes = 1024; 192int min_free_kbytes = 1024;
193int min_free_order_shift = 1;
179 194
180static unsigned long __meminitdata nr_kernel_pages; 195static unsigned long __meminitdata nr_kernel_pages;
181static unsigned long __meminitdata nr_all_pages; 196static unsigned long __meminitdata nr_all_pages;
@@ -355,8 +370,8 @@ void prep_compound_page(struct page *page, unsigned long order)
355 __SetPageHead(page); 370 __SetPageHead(page);
356 for (i = 1; i < nr_pages; i++) { 371 for (i = 1; i < nr_pages; i++) {
357 struct page *p = page + i; 372 struct page *p = page + i;
358
359 __SetPageTail(p); 373 __SetPageTail(p);
374 set_page_count(p, 0);
360 p->first_page = page; 375 p->first_page = page;
361 } 376 }
362} 377}
@@ -1487,7 +1502,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1487 free_pages -= z->free_area[o].nr_free << o; 1502 free_pages -= z->free_area[o].nr_free << o;
1488 1503
1489 /* Require fewer higher order pages to be free */ 1504 /* Require fewer higher order pages to be free */
1490 min >>= 1; 1505 min >>= min_free_order_shift;
1491 1506
1492 if (free_pages <= min) 1507 if (free_pages <= min)
1493 return false; 1508 return false;
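
For context, the check walks the free lists order by order: pages of orders below the request are discounted, and the required reserve is shrunk at each step up. A simplified standalone version (invented free_area contents, none of the alloc_flags handling, and only assumed to match the surrounding mainline code) shows what the new min_free_order_shift knob changes:

#include <stdio.h>
#include <stdbool.h>

#define MAX_ORDER       11

/* simplified: no alloc_flags/classzone handling, invented free_area contents */
static bool watermark_ok(unsigned long free_pages, unsigned long min, int order,
                         const unsigned long *nr_free, int min_free_order_shift)
{
        int o;

        if (free_pages <= min)
                return false;
        for (o = 0; o < order; o++) {
                /* pages of this order are too small for the request */
                free_pages -= nr_free[o] << o;
                /* require fewer higher order pages to be free */
                min >>= min_free_order_shift;
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        /* a fragmented zone: lots of order-0 pages, few big blocks */
        unsigned long nr_free[MAX_ORDER] = { 3000, 40, 10, 2 };
        unsigned long free = 0, min = 1024;
        int o;

        for (o = 0; o < MAX_ORDER; o++)
                free += nr_free[o] << o;
        printf("order-3 check, shift=1: %s\n",
               watermark_ok(free, min, 3, nr_free, 1) ? "ok" : "fail");
        printf("order-3 check, shift=3: %s\n",
               watermark_ok(free, min, 3, nr_free, 3) ? "ok" : "fail");
        return 0;
}

With the stock shift of 1 the fragmented example zone fails an order-3 check even though plenty of memory is free in small pieces; a larger shift lets the requirement decay faster, which is presumably why the value was made tunable.
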
@@ -1616,6 +1631,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1616 set_bit(i, zlc->fullzones); 1631 set_bit(i, zlc->fullzones);
1617} 1632}
1618 1633
1634/*
1635 * clear all zones full, called after direct reclaim makes progress so that
1636 * a zone that was recently full is not skipped over for up to a second
1637 */
1638static void zlc_clear_zones_full(struct zonelist *zonelist)
1639{
1640 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1641
1642 zlc = zonelist->zlcache_ptr;
1643 if (!zlc)
1644 return;
1645
1646 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1647}
1648
1619#else /* CONFIG_NUMA */ 1649#else /* CONFIG_NUMA */
1620 1650
1621static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1651static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1632,6 +1662,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1632static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1662static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1633{ 1663{
1634} 1664}
1665
1666static void zlc_clear_zones_full(struct zonelist *zonelist)
1667{
1668}
1635#endif /* CONFIG_NUMA */ 1669#endif /* CONFIG_NUMA */
1636 1670
1637/* 1671/*
@@ -1664,7 +1698,7 @@ zonelist_scan:
1664 continue; 1698 continue;
1665 if ((alloc_flags & ALLOC_CPUSET) && 1699 if ((alloc_flags & ALLOC_CPUSET) &&
1666 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1700 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1667 goto try_next_zone; 1701 continue;
1668 1702
1669 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1703 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1670 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1704 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1676,17 +1710,36 @@ zonelist_scan:
1676 classzone_idx, alloc_flags)) 1710 classzone_idx, alloc_flags))
1677 goto try_this_zone; 1711 goto try_this_zone;
1678 1712
1713 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1714 /*
1715 * we do zlc_setup if there are multiple nodes
1716 * and before considering the first zone allowed
1717 * by the cpuset.
1718 */
1719 allowednodes = zlc_setup(zonelist, alloc_flags);
1720 zlc_active = 1;
1721 did_zlc_setup = 1;
1722 }
1723
1679 if (zone_reclaim_mode == 0) 1724 if (zone_reclaim_mode == 0)
1680 goto this_zone_full; 1725 goto this_zone_full;
1681 1726
1727 /*
1728 * As we may have just activated ZLC, check if the first
1729 * eligible zone has failed zone_reclaim recently.
1730 */
1731 if (NUMA_BUILD && zlc_active &&
1732 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1733 continue;
1734
1682 ret = zone_reclaim(zone, gfp_mask, order); 1735 ret = zone_reclaim(zone, gfp_mask, order);
1683 switch (ret) { 1736 switch (ret) {
1684 case ZONE_RECLAIM_NOSCAN: 1737 case ZONE_RECLAIM_NOSCAN:
1685 /* did not scan */ 1738 /* did not scan */
1686 goto try_next_zone; 1739 continue;
1687 case ZONE_RECLAIM_FULL: 1740 case ZONE_RECLAIM_FULL:
1688 /* scanned but unreclaimable */ 1741 /* scanned but unreclaimable */
1689 goto this_zone_full; 1742 continue;
1690 default: 1743 default:
1691 /* did we reclaim enough */ 1744 /* did we reclaim enough */
1692 if (!zone_watermark_ok(zone, order, mark, 1745 if (!zone_watermark_ok(zone, order, mark,
@@ -1703,16 +1756,6 @@ try_this_zone:
1703this_zone_full: 1756this_zone_full:
1704 if (NUMA_BUILD) 1757 if (NUMA_BUILD)
1705 zlc_mark_zone_full(zonelist, z); 1758 zlc_mark_zone_full(zonelist, z);
1706try_next_zone:
1707 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1708 /*
1709 * we do zlc_setup after the first zone is tried but only
1710 * if there are multiple nodes make it worthwhile
1711 */
1712 allowednodes = zlc_setup(zonelist, alloc_flags);
1713 zlc_active = 1;
1714 did_zlc_setup = 1;
1715 }
1716 } 1759 }
1717 1760
1718 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1761 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@ -1869,14 +1912,20 @@ static struct page *
1869__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1912__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1870 struct zonelist *zonelist, enum zone_type high_zoneidx, 1913 struct zonelist *zonelist, enum zone_type high_zoneidx,
1871 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1914 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1872 int migratetype, unsigned long *did_some_progress, 1915 int migratetype, bool sync_migration,
1873 bool sync_migration) 1916 bool *deferred_compaction,
1917 unsigned long *did_some_progress)
1874{ 1918{
1875 struct page *page; 1919 struct page *page;
1876 1920
1877 if (!order || compaction_deferred(preferred_zone)) 1921 if (!order)
1878 return NULL; 1922 return NULL;
1879 1923
1924 if (compaction_deferred(preferred_zone)) {
1925 *deferred_compaction = true;
1926 return NULL;
1927 }
1928
1880 current->flags |= PF_MEMALLOC; 1929 current->flags |= PF_MEMALLOC;
1881 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 1930 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1882 nodemask, sync_migration); 1931 nodemask, sync_migration);
@@ -1904,7 +1953,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1904 * but not enough to satisfy watermarks. 1953 * but not enough to satisfy watermarks.
1905 */ 1954 */
1906 count_vm_event(COMPACTFAIL); 1955 count_vm_event(COMPACTFAIL);
1907 defer_compaction(preferred_zone); 1956
1957 /*
1958 * As async compaction considers a subset of pageblocks, only
1959 * defer if the failure was a sync compaction failure.
1960 */
1961 if (sync_migration)
1962 defer_compaction(preferred_zone);
1908 1963
1909 cond_resched(); 1964 cond_resched();
1910 } 1965 }
@@ -1916,8 +1971,9 @@ static inline struct page *
1916__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1971__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1917 struct zonelist *zonelist, enum zone_type high_zoneidx, 1972 struct zonelist *zonelist, enum zone_type high_zoneidx,
1918 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1973 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1919 int migratetype, unsigned long *did_some_progress, 1974 int migratetype, bool sync_migration,
1920 bool sync_migration) 1975 bool *deferred_compaction,
1976 unsigned long *did_some_progress)
1921{ 1977{
1922 return NULL; 1978 return NULL;
1923} 1979}
@@ -1954,6 +2010,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1954 if (unlikely(!(*did_some_progress))) 2010 if (unlikely(!(*did_some_progress)))
1955 return NULL; 2011 return NULL;
1956 2012
2013 /* After successful reclaim, reconsider all zones for allocation */
2014 if (NUMA_BUILD)
2015 zlc_clear_zones_full(zonelist);
2016
1957retry: 2017retry:
1958 page = get_page_from_freelist(gfp_mask, nodemask, order, 2018 page = get_page_from_freelist(gfp_mask, nodemask, order,
1959 zonelist, high_zoneidx, 2019 zonelist, high_zoneidx,
@@ -2063,6 +2123,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2063 unsigned long pages_reclaimed = 0; 2123 unsigned long pages_reclaimed = 0;
2064 unsigned long did_some_progress; 2124 unsigned long did_some_progress;
2065 bool sync_migration = false; 2125 bool sync_migration = false;
2126 bool deferred_compaction = false;
2066 2127
2067 /* 2128 /*
2068 * In the slowpath, we sanity check order to avoid ever trying to 2129 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2143,12 +2204,22 @@ rebalance:
2143 zonelist, high_zoneidx, 2204 zonelist, high_zoneidx,
2144 nodemask, 2205 nodemask,
2145 alloc_flags, preferred_zone, 2206 alloc_flags, preferred_zone,
2146 migratetype, &did_some_progress, 2207 migratetype, sync_migration,
2147 sync_migration); 2208 &deferred_compaction,
2209 &did_some_progress);
2148 if (page) 2210 if (page)
2149 goto got_pg; 2211 goto got_pg;
2150 sync_migration = true; 2212 sync_migration = true;
2151 2213
2214 /*
2215 * If compaction is deferred for high-order allocations, it is because
2216 * sync compaction recently failed. In this is the case and the caller
2217 * has requested the system not be heavily disrupted, fail the
2218 * allocation now instead of entering direct reclaim
2219 */
2220 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
2221 goto nopage;
2222
2152 /* Try direct reclaim and then allocating */ 2223 /* Try direct reclaim and then allocating */
2153 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2224 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2154 zonelist, high_zoneidx, 2225 zonelist, high_zoneidx,
@@ -2193,6 +2264,14 @@ rebalance:
2193 2264
2194 goto restart; 2265 goto restart;
2195 } 2266 }
2267
2268 /*
2269 * Suspend converts GFP_KERNEL to __GFP_WAIT which can
2270 * prevent reclaim making forward progress without
2271 * invoking OOM. Bail if we are suspending
2272 */
2273 if (pm_suspending())
2274 goto nopage;
2196 } 2275 }
2197 2276
2198 /* Check if we should retry the allocation */ 2277 /* Check if we should retry the allocation */
@@ -2211,8 +2290,9 @@ rebalance:
2211 zonelist, high_zoneidx, 2290 zonelist, high_zoneidx,
2212 nodemask, 2291 nodemask,
2213 alloc_flags, preferred_zone, 2292 alloc_flags, preferred_zone,
2214 migratetype, &did_some_progress, 2293 migratetype, sync_migration,
2215 sync_migration); 2294 &deferred_compaction,
2295 &did_some_progress);
2216 if (page) 2296 if (page)
2217 goto got_pg; 2297 goto got_pg;
2218 } 2298 }
@@ -2236,8 +2316,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2236{ 2316{
2237 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2317 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2238 struct zone *preferred_zone; 2318 struct zone *preferred_zone;
2239 struct page *page; 2319 struct page *page = NULL;
2240 int migratetype = allocflags_to_migratetype(gfp_mask); 2320 int migratetype = allocflags_to_migratetype(gfp_mask);
2321 unsigned int cpuset_mems_cookie;
2241 2322
2242 gfp_mask &= gfp_allowed_mask; 2323 gfp_mask &= gfp_allowed_mask;
2243 2324
@@ -2256,15 +2337,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2256 if (unlikely(!zonelist->_zonerefs->zone)) 2337 if (unlikely(!zonelist->_zonerefs->zone))
2257 return NULL; 2338 return NULL;
2258 2339
2259 get_mems_allowed(); 2340retry_cpuset:
2341 cpuset_mems_cookie = get_mems_allowed();
2342
2260 /* The preferred zone is used for statistics later */ 2343 /* The preferred zone is used for statistics later */
2261 first_zones_zonelist(zonelist, high_zoneidx, 2344 first_zones_zonelist(zonelist, high_zoneidx,
2262 nodemask ? : &cpuset_current_mems_allowed, 2345 nodemask ? : &cpuset_current_mems_allowed,
2263 &preferred_zone); 2346 &preferred_zone);
2264 if (!preferred_zone) { 2347 if (!preferred_zone)
2265 put_mems_allowed(); 2348 goto out;
2266 return NULL;
2267 }
2268 2349
2269 /* First allocation attempt */ 2350 /* First allocation attempt */
2270 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2351 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2274,9 +2355,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2274 page = __alloc_pages_slowpath(gfp_mask, order, 2355 page = __alloc_pages_slowpath(gfp_mask, order,
2275 zonelist, high_zoneidx, nodemask, 2356 zonelist, high_zoneidx, nodemask,
2276 preferred_zone, migratetype); 2357 preferred_zone, migratetype);
2277 put_mems_allowed();
2278 2358
2279 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2359 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2360
2361out:
2362 /*
2363 * When updating a task's mems_allowed, it is possible to race with
2364 * parallel threads in such a way that an allocation can fail while
2365 * the mask is being updated. If a page allocation is about to fail,
2366 * check if the cpuset changed during allocation and if so, retry.
2367 */
2368 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2369 goto retry_cpuset;
2370
2280 return page; 2371 return page;
2281} 2372}
2282EXPORT_SYMBOL(__alloc_pages_nodemask); 2373EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2500,13 +2591,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2500bool skip_free_areas_node(unsigned int flags, int nid) 2591bool skip_free_areas_node(unsigned int flags, int nid)
2501{ 2592{
2502 bool ret = false; 2593 bool ret = false;
2594 unsigned int cpuset_mems_cookie;
2503 2595
2504 if (!(flags & SHOW_MEM_FILTER_NODES)) 2596 if (!(flags & SHOW_MEM_FILTER_NODES))
2505 goto out; 2597 goto out;
2506 2598
2507 get_mems_allowed(); 2599 do {
2508 ret = !node_isset(nid, cpuset_current_mems_allowed); 2600 cpuset_mems_cookie = get_mems_allowed();
2509 put_mems_allowed(); 2601 ret = !node_isset(nid, cpuset_current_mems_allowed);
2602 } while (!put_mems_allowed(cpuset_mems_cookie));
2510out: 2603out:
2511 return ret; 2604 return ret;
2512} 2605}
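
The get/put_mems_allowed() pairing above behaves like a read-side retry loop: grab a cookie, do the work, and redo it if the cpuset's mems_allowed changed underneath. Below is a userspace analogy with a toy sequence counter (single-threaded, no memory barriers, purely illustrative of the contract rather than the kernel's implementation):

#include <stdio.h>

/* toy sequence counter: even = stable, odd = writer in progress */
static unsigned int seq;
static int mems_allowed = 1;            /* stand-in for the nodemask */

static unsigned int read_begin(void)    { return seq; }
static int read_retry(unsigned int c)   { return (c & 1) || c != seq; }

static void writer_update(int newval)
{
        seq++;                          /* writer enters (seq goes odd) */
        mems_allowed = newval;
        seq++;                          /* writer leaves (seq even again) */
}

int main(void)
{
        unsigned int cookie;
        int allowed, tries = 0;

        do {
                cookie = read_begin();          /* get_mems_allowed() */
                allowed = mems_allowed;         /* ... use the mask ... */
                if (tries++ == 0)
                        writer_update(0);       /* simulate a racing update */
        } while (read_retry(cookie));           /* !put_mems_allowed(cookie) */

        printf("consistent value after %d tries: %d\n", tries, allowed);
        return 0;
}

The same shape appears in __alloc_pages_nodemask() above: the allocation only retries when it both failed and raced with a mems_allowed update, so the common path pays nothing extra.
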
@@ -3356,9 +3449,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3356 unsigned long block_migratetype; 3449 unsigned long block_migratetype;
3357 int reserve; 3450 int reserve;
3358 3451
3359 /* Get the start pfn, end pfn and the number of blocks to reserve */ 3452 /*
 3453 * Get the start pfn, end pfn and the number of blocks to reserve.
3454 * We have to be careful to be aligned to pageblock_nr_pages to
3455 * make sure that we always check pfn_valid for the first page in
3456 * the block.
3457 */
3360 start_pfn = zone->zone_start_pfn; 3458 start_pfn = zone->zone_start_pfn;
3361 end_pfn = start_pfn + zone->spanned_pages; 3459 end_pfn = start_pfn + zone->spanned_pages;
3460 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3362 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3461 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3363 pageblock_order; 3462 pageblock_order;
3364 3463
@@ -3380,25 +3479,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3380 if (page_to_nid(page) != zone_to_nid(zone)) 3479 if (page_to_nid(page) != zone_to_nid(zone))
3381 continue; 3480 continue;
3382 3481
3383 /* Blocks with reserved pages will never free, skip them. */
3384 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3385 if (pageblock_is_reserved(pfn, block_end_pfn))
3386 continue;
3387
3388 block_migratetype = get_pageblock_migratetype(page); 3482 block_migratetype = get_pageblock_migratetype(page);
3389 3483
3390 /* If this block is reserved, account for it */ 3484 /* Only test what is necessary when the reserves are not met */
3391 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { 3485 if (reserve > 0) {
3392 reserve--; 3486 /*
3393 continue; 3487 * Blocks with reserved pages will never free, skip
3394 } 3488 * them.
3489 */
3490 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3491 if (pageblock_is_reserved(pfn, block_end_pfn))
3492 continue;
3395 3493
3396 /* Suitable for reserving if this block is movable */ 3494 /* If this block is reserved, account for it */
3397 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { 3495 if (block_migratetype == MIGRATE_RESERVE) {
3398 set_pageblock_migratetype(page, MIGRATE_RESERVE); 3496 reserve--;
3399 move_freepages_block(zone, page, MIGRATE_RESERVE); 3497 continue;
3400 reserve--; 3498 }
3401 continue; 3499
3500 /* Suitable for reserving if this block is movable */
3501 if (block_migratetype == MIGRATE_MOVABLE) {
3502 set_pageblock_migratetype(page,
3503 MIGRATE_RESERVE);
3504 move_freepages_block(zone, page,
3505 MIGRATE_RESERVE);
3506 reserve--;
3507 continue;
3508 }
3402 } 3509 }
3403 3510
3404 /* 3511 /*
@@ -5527,6 +5634,17 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5527bool is_pageblock_removable_nolock(struct page *page) 5634bool is_pageblock_removable_nolock(struct page *page)
5528{ 5635{
5529 struct zone *zone = page_zone(page); 5636 struct zone *zone = page_zone(page);
5637 unsigned long pfn = page_to_pfn(page);
5638
5639 /*
5640 * We have to be careful here because we are iterating over memory
5641 * sections which are not zone aware so we might end up outside of
5642 * the zone but still within the section.
5643 */
5644 if (!zone || zone->zone_start_pfn > pfn ||
5645 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5646 return false;
5647
5530 return __count_immobile_pages(zone, page, 0); 5648 return __count_immobile_pages(zone, page, 0);
5531} 5649}
5532 5650
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index c3450d53361..87eac0ea2bf 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -59,7 +59,7 @@ again:
59 continue; 59 continue;
60 60
61 split_huge_page_pmd(walk->mm, pmd); 61 split_huge_page_pmd(walk->mm, pmd);
62 if (pmd_none_or_clear_bad(pmd)) 62 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
63 goto again; 63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk); 64 err = walk_pte_range(pmd, addr, next, walk);
65 if (err) 65 if (err)
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index ea534960a04..bfad7246665 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -143,8 +143,8 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
143 int page_start, int page_end) 143 int page_start, int page_end)
144{ 144{
145 flush_cache_vunmap( 145 flush_cache_vunmap(
146 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), 146 pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
147 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); 147 pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
148} 148}
149 149
150static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) 150static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
@@ -206,8 +206,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
206 int page_start, int page_end) 206 int page_start, int page_end)
207{ 207{
208 flush_tlb_kernel_range( 208 flush_tlb_kernel_range(
209 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), 209 pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
210 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); 210 pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
211} 211}
212 212
213static int __pcpu_map_pages(unsigned long addr, struct page **pages, 213static int __pcpu_map_pages(unsigned long addr, struct page **pages,
@@ -284,8 +284,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
284 int page_start, int page_end) 284 int page_start, int page_end)
285{ 285{
286 flush_cache_vmap( 286 flush_cache_vmap(
287 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), 287 pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
288 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); 288 pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
289} 289}
290 290
291/** 291/**
diff --git a/mm/percpu.c b/mm/percpu.c
index bf80e55dbed..af0cc7a58f9 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -116,9 +116,9 @@ static int pcpu_atom_size __read_mostly;
116static int pcpu_nr_slots __read_mostly; 116static int pcpu_nr_slots __read_mostly;
117static size_t pcpu_chunk_struct_size __read_mostly; 117static size_t pcpu_chunk_struct_size __read_mostly;
118 118
119/* cpus with the lowest and highest unit numbers */ 119/* cpus with the lowest and highest unit addresses */
120static unsigned int pcpu_first_unit_cpu __read_mostly; 120static unsigned int pcpu_low_unit_cpu __read_mostly;
121static unsigned int pcpu_last_unit_cpu __read_mostly; 121static unsigned int pcpu_high_unit_cpu __read_mostly;
122 122
123/* the address of the first chunk which starts with the kernel static area */ 123/* the address of the first chunk which starts with the kernel static area */
124void *pcpu_base_addr __read_mostly; 124void *pcpu_base_addr __read_mostly;
@@ -984,19 +984,19 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
984{ 984{
985 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); 985 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
986 bool in_first_chunk = false; 986 bool in_first_chunk = false;
987 unsigned long first_start, first_end; 987 unsigned long first_low, first_high;
988 unsigned int cpu; 988 unsigned int cpu;
989 989
990 /* 990 /*
991 * The following test on first_start/end isn't strictly 991 * The following test on unit_low/high isn't strictly
992 * necessary but will speed up lookups of addresses which 992 * necessary but will speed up lookups of addresses which
993 * aren't in the first chunk. 993 * aren't in the first chunk.
994 */ 994 */
995 first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0); 995 first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
996 first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu, 996 first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
997 pcpu_unit_pages); 997 pcpu_unit_pages);
998 if ((unsigned long)addr >= first_start && 998 if ((unsigned long)addr >= first_low &&
999 (unsigned long)addr < first_end) { 999 (unsigned long)addr < first_high) {
1000 for_each_possible_cpu(cpu) { 1000 for_each_possible_cpu(cpu) {
1001 void *start = per_cpu_ptr(base, cpu); 1001 void *start = per_cpu_ptr(base, cpu);
1002 1002
@@ -1011,9 +1011,11 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
1011 if (!is_vmalloc_addr(addr)) 1011 if (!is_vmalloc_addr(addr))
1012 return __pa(addr); 1012 return __pa(addr);
1013 else 1013 else
1014 return page_to_phys(vmalloc_to_page(addr)); 1014 return page_to_phys(vmalloc_to_page(addr)) +
1015 offset_in_page(addr);
1015 } else 1016 } else
1016 return page_to_phys(pcpu_addr_to_page(addr)); 1017 return page_to_phys(pcpu_addr_to_page(addr)) +
1018 offset_in_page(addr);
1017} 1019}
1018 1020
1019/** 1021/**
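A minimal sketch of the translation the per_cpu_ptr_to_phys() hunk above corrects: page_to_phys() resolves only the backing page frame, so the offset of the pointer inside its page has to be added back by hand. The helper below is illustrative only and not part of the patch.

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Illustrative only: mirrors the fixed return paths of per_cpu_ptr_to_phys(). */
static phys_addr_t pcpu_vaddr_to_phys_sketch(void *addr)
{
	if (!is_vmalloc_addr(addr))
		return __pa(addr);
	/* page_to_phys() gives the frame's base; add the intra-page offset back */
	return page_to_phys(vmalloc_to_page(addr)) + offset_in_page(addr);
}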
@@ -1233,7 +1235,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1233 1235
1234 for (cpu = 0; cpu < nr_cpu_ids; cpu++) 1236 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1235 unit_map[cpu] = UINT_MAX; 1237 unit_map[cpu] = UINT_MAX;
1236 pcpu_first_unit_cpu = NR_CPUS; 1238
1239 pcpu_low_unit_cpu = NR_CPUS;
1240 pcpu_high_unit_cpu = NR_CPUS;
1237 1241
1238 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { 1242 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
1239 const struct pcpu_group_info *gi = &ai->groups[group]; 1243 const struct pcpu_group_info *gi = &ai->groups[group];
@@ -1253,9 +1257,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1253 unit_map[cpu] = unit + i; 1257 unit_map[cpu] = unit + i;
1254 unit_off[cpu] = gi->base_offset + i * ai->unit_size; 1258 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
1255 1259
1256 if (pcpu_first_unit_cpu == NR_CPUS) 1260 /* determine low/high unit_cpu */
1257 pcpu_first_unit_cpu = cpu; 1261 if (pcpu_low_unit_cpu == NR_CPUS ||
1258 pcpu_last_unit_cpu = cpu; 1262 unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
1263 pcpu_low_unit_cpu = cpu;
1264 if (pcpu_high_unit_cpu == NR_CPUS ||
1265 unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
1266 pcpu_high_unit_cpu = cpu;
1259 } 1267 }
1260 } 1268 }
1261 pcpu_nr_units = unit; 1269 pcpu_nr_units = unit;
@@ -1622,6 +1630,16 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1622 areas[group] = ptr; 1630 areas[group] = ptr;
1623 1631
1624 base = min(ptr, base); 1632 base = min(ptr, base);
1633 }
1634
1635 /*
1636 * Copy data and free unused parts. This should happen after all
1637 * allocations are complete; otherwise, we may end up with
1638 * overlapping groups.
1639 */
1640 for (group = 0; group < ai->nr_groups; group++) {
1641 struct pcpu_group_info *gi = &ai->groups[group];
1642 void *ptr = areas[group];
1625 1643
1626 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { 1644 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
1627 if (gi->cpu_map[i] == NR_CPUS) { 1645 if (gi->cpu_map[i] == NR_CPUS) {
diff --git a/mm/shmem.c b/mm/shmem.c
index fcedf5464eb..b5a1b89b2d6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -51,6 +51,7 @@ static struct vfsmount *shm_mnt;
51#include <linux/shmem_fs.h> 51#include <linux/shmem_fs.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/blkdev.h> 53#include <linux/blkdev.h>
54#include <linux/splice.h>
54#include <linux/security.h> 55#include <linux/security.h>
55#include <linux/swapops.h> 56#include <linux/swapops.h>
56#include <linux/mempolicy.h> 57#include <linux/mempolicy.h>
@@ -126,8 +127,15 @@ static unsigned long shmem_default_max_inodes(void)
126} 127}
127#endif 128#endif
128 129
129static int shmem_getpage(struct inode *inode, unsigned long idx, 130static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
130 struct page **pagep, enum sgp_type sgp, int *type); 131 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
132
133static inline int shmem_getpage(struct inode *inode, pgoff_t index,
134 struct page **pagep, enum sgp_type sgp, int *fault_type)
135{
136 return shmem_getpage_gfp(inode, index, pagep, sgp,
137 mapping_gfp_mask(inode->i_mapping), fault_type);
138}
131 139
132static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) 140static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
133{ 141{
@@ -405,10 +413,12 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
405 * @info: info structure for the inode 413 * @info: info structure for the inode
406 * @index: index of the page to find 414 * @index: index of the page to find
407 * @sgp: check and recheck i_size? skip allocation? 415 * @sgp: check and recheck i_size? skip allocation?
416 * @gfp: gfp mask to use for any page allocation
408 * 417 *
409 * If the entry does not exist, allocate it. 418 * If the entry does not exist, allocate it.
410 */ 419 */
411static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) 420static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info,
421 unsigned long index, enum sgp_type sgp, gfp_t gfp)
412{ 422{
413 struct inode *inode = &info->vfs_inode; 423 struct inode *inode = &info->vfs_inode;
414 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 424 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
@@ -438,7 +448,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
438 } 448 }
439 449
440 spin_unlock(&info->lock); 450 spin_unlock(&info->lock);
441 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); 451 page = shmem_dir_alloc(gfp);
442 spin_lock(&info->lock); 452 spin_lock(&info->lock);
443 453
444 if (!page) { 454 if (!page) {
@@ -1228,92 +1238,83 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1228#endif 1238#endif
1229 1239
1230/* 1240/*
1231 * shmem_getpage - either get the page from swap or allocate a new one 1241 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
1232 * 1242 *
1233 * If we allocate a new one we do not mark it dirty. That's up to the 1243 * If we allocate a new one we do not mark it dirty. That's up to the
1234 * vm. If we swap it in we mark it dirty since we also free the swap 1244 * vm. If we swap it in we mark it dirty since we also free the swap
1235 * entry since a page cannot live in both the swap and page cache 1245 * entry since a page cannot live in both the swap and page cache
1236 */ 1246 */
1237static int shmem_getpage(struct inode *inode, unsigned long idx, 1247static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx,
1238 struct page **pagep, enum sgp_type sgp, int *type) 1248 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
1239{ 1249{
1240 struct address_space *mapping = inode->i_mapping; 1250 struct address_space *mapping = inode->i_mapping;
1241 struct shmem_inode_info *info = SHMEM_I(inode); 1251 struct shmem_inode_info *info = SHMEM_I(inode);
1242 struct shmem_sb_info *sbinfo; 1252 struct shmem_sb_info *sbinfo;
1243 struct page *filepage = *pagep; 1253 struct page *page;
1244 struct page *swappage;
1245 struct page *prealloc_page = NULL; 1254 struct page *prealloc_page = NULL;
1246 swp_entry_t *entry; 1255 swp_entry_t *entry;
1247 swp_entry_t swap; 1256 swp_entry_t swap;
1248 gfp_t gfp;
1249 int error; 1257 int error;
1258 int ret;
1250 1259
1251 if (idx >= SHMEM_MAX_INDEX) 1260 if (idx >= SHMEM_MAX_INDEX)
1252 return -EFBIG; 1261 return -EFBIG;
1253
1254 if (type)
1255 *type = 0;
1256
1257 /*
1258 * Normally, filepage is NULL on entry, and either found
1259 * uptodate immediately, or allocated and zeroed, or read
1260 * in under swappage, which is then assigned to filepage.
1261 * But shmem_readpage (required for splice) passes in a locked
1262 * filepage, which may be found not uptodate by other callers
1263 * too, and may need to be copied from the swappage read in.
1264 */
1265repeat: 1262repeat:
1266 if (!filepage) 1263 page = find_lock_page(mapping, idx);
1267 filepage = find_lock_page(mapping, idx); 1264 if (page) {
1268 if (filepage && PageUptodate(filepage))
1269 goto done;
1270 gfp = mapping_gfp_mask(mapping);
1271 if (!filepage) {
1272 /* 1265 /*
1273 * Try to preload while we can wait, to not make a habit of 1266 * Once we can get the page lock, it must be uptodate:
1274 * draining atomic reserves; but don't latch on to this cpu. 1267 * if there were an error in reading back from swap,
1268 * the page would not be inserted into the filecache.
1275 */ 1269 */
1276 error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 1270 BUG_ON(!PageUptodate(page));
1277 if (error) 1271 goto done;
1278 goto failed; 1272 }
1279 radix_tree_preload_end(); 1273
1280 if (sgp != SGP_READ && !prealloc_page) { 1274 /*
1281 /* We don't care if this fails */ 1275 * Try to preload while we can wait, to not make a habit of
1282 prealloc_page = shmem_alloc_page(gfp, info, idx); 1276 * draining atomic reserves; but don't latch on to this cpu.
1283 if (prealloc_page) { 1277 */
1284 if (mem_cgroup_cache_charge(prealloc_page, 1278 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
1285 current->mm, GFP_KERNEL)) { 1279 if (error)
1286 page_cache_release(prealloc_page); 1280 goto out;
1287 prealloc_page = NULL; 1281 radix_tree_preload_end();
1288 } 1282
1283 if (sgp != SGP_READ && !prealloc_page) {
1284 prealloc_page = shmem_alloc_page(gfp, info, idx);
1285 if (prealloc_page) {
1286 SetPageSwapBacked(prealloc_page);
1287 if (mem_cgroup_cache_charge(prealloc_page,
1288 current->mm, GFP_KERNEL)) {
1289 page_cache_release(prealloc_page);
1290 prealloc_page = NULL;
1289 } 1291 }
1290 } 1292 }
1291 } 1293 }
1292 error = 0;
1293 1294
1294 spin_lock(&info->lock); 1295 spin_lock(&info->lock);
1295 shmem_recalc_inode(inode); 1296 shmem_recalc_inode(inode);
1296 entry = shmem_swp_alloc(info, idx, sgp); 1297 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1297 if (IS_ERR(entry)) { 1298 if (IS_ERR(entry)) {
1298 spin_unlock(&info->lock); 1299 spin_unlock(&info->lock);
1299 error = PTR_ERR(entry); 1300 error = PTR_ERR(entry);
1300 goto failed; 1301 goto out;
1301 } 1302 }
1302 swap = *entry; 1303 swap = *entry;
1303 1304
1304 if (swap.val) { 1305 if (swap.val) {
1305 /* Look it up and read it in.. */ 1306 /* Look it up and read it in.. */
1306 swappage = lookup_swap_cache(swap); 1307 page = lookup_swap_cache(swap);
1307 if (!swappage) { 1308 if (!page) {
1308 shmem_swp_unmap(entry); 1309 shmem_swp_unmap(entry);
1309 spin_unlock(&info->lock); 1310 spin_unlock(&info->lock);
1310 /* here we actually do the io */ 1311 /* here we actually do the io */
1311 if (type) 1312 if (fault_type)
1312 *type |= VM_FAULT_MAJOR; 1313 *fault_type |= VM_FAULT_MAJOR;
1313 swappage = shmem_swapin(swap, gfp, info, idx); 1314 page = shmem_swapin(swap, gfp, info, idx);
1314 if (!swappage) { 1315 if (!page) {
1315 spin_lock(&info->lock); 1316 spin_lock(&info->lock);
1316 entry = shmem_swp_alloc(info, idx, sgp); 1317 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1317 if (IS_ERR(entry)) 1318 if (IS_ERR(entry))
1318 error = PTR_ERR(entry); 1319 error = PTR_ERR(entry);
1319 else { 1320 else {
@@ -1323,62 +1324,42 @@ repeat:
1323 } 1324 }
1324 spin_unlock(&info->lock); 1325 spin_unlock(&info->lock);
1325 if (error) 1326 if (error)
1326 goto failed; 1327 goto out;
1327 goto repeat; 1328 goto repeat;
1328 } 1329 }
1329 wait_on_page_locked(swappage); 1330 wait_on_page_locked(page);
1330 page_cache_release(swappage); 1331 page_cache_release(page);
1331 goto repeat; 1332 goto repeat;
1332 } 1333 }
1333 1334
1334 /* We have to do this with page locked to prevent races */ 1335 /* We have to do this with page locked to prevent races */
1335 if (!trylock_page(swappage)) { 1336 if (!trylock_page(page)) {
1336 shmem_swp_unmap(entry); 1337 shmem_swp_unmap(entry);
1337 spin_unlock(&info->lock); 1338 spin_unlock(&info->lock);
1338 wait_on_page_locked(swappage); 1339 wait_on_page_locked(page);
1339 page_cache_release(swappage); 1340 page_cache_release(page);
1340 goto repeat; 1341 goto repeat;
1341 } 1342 }
1342 if (PageWriteback(swappage)) { 1343 if (PageWriteback(page)) {
1343 shmem_swp_unmap(entry); 1344 shmem_swp_unmap(entry);
1344 spin_unlock(&info->lock); 1345 spin_unlock(&info->lock);
1345 wait_on_page_writeback(swappage); 1346 wait_on_page_writeback(page);
1346 unlock_page(swappage); 1347 unlock_page(page);
1347 page_cache_release(swappage); 1348 page_cache_release(page);
1348 goto repeat; 1349 goto repeat;
1349 } 1350 }
1350 if (!PageUptodate(swappage)) { 1351 if (!PageUptodate(page)) {
1351 shmem_swp_unmap(entry); 1352 shmem_swp_unmap(entry);
1352 spin_unlock(&info->lock); 1353 spin_unlock(&info->lock);
1353 unlock_page(swappage); 1354 unlock_page(page);
1354 page_cache_release(swappage); 1355 page_cache_release(page);
1355 error = -EIO; 1356 error = -EIO;
1356 goto failed; 1357 goto out;
1357 } 1358 }
1358 1359
1359 if (filepage) { 1360 error = add_to_page_cache_locked(page, mapping,
1360 shmem_swp_set(info, entry, 0); 1361 idx, GFP_NOWAIT);
1361 shmem_swp_unmap(entry); 1362 if (error) {
1362 delete_from_swap_cache(swappage);
1363 spin_unlock(&info->lock);
1364 copy_highpage(filepage, swappage);
1365 unlock_page(swappage);
1366 page_cache_release(swappage);
1367 flush_dcache_page(filepage);
1368 SetPageUptodate(filepage);
1369 set_page_dirty(filepage);
1370 swap_free(swap);
1371 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1372 idx, GFP_NOWAIT))) {
1373 info->flags |= SHMEM_PAGEIN;
1374 shmem_swp_set(info, entry, 0);
1375 shmem_swp_unmap(entry);
1376 delete_from_swap_cache(swappage);
1377 spin_unlock(&info->lock);
1378 filepage = swappage;
1379 set_page_dirty(filepage);
1380 swap_free(swap);
1381 } else {
1382 shmem_swp_unmap(entry); 1363 shmem_swp_unmap(entry);
1383 spin_unlock(&info->lock); 1364 spin_unlock(&info->lock);
1384 if (error == -ENOMEM) { 1365 if (error == -ENOMEM) {
@@ -1387,32 +1368,38 @@ repeat:
1387 * call memcg's OOM if needed. 1368 * call memcg's OOM if needed.
1388 */ 1369 */
1389 error = mem_cgroup_shmem_charge_fallback( 1370 error = mem_cgroup_shmem_charge_fallback(
1390 swappage, 1371 page, current->mm, gfp);
1391 current->mm,
1392 gfp);
1393 if (error) { 1372 if (error) {
1394 unlock_page(swappage); 1373 unlock_page(page);
1395 page_cache_release(swappage); 1374 page_cache_release(page);
1396 goto failed; 1375 goto out;
1397 } 1376 }
1398 } 1377 }
1399 unlock_page(swappage); 1378 unlock_page(page);
1400 page_cache_release(swappage); 1379 page_cache_release(page);
1401 goto repeat; 1380 goto repeat;
1402 } 1381 }
1403 } else if (sgp == SGP_READ && !filepage) { 1382
1383 info->flags |= SHMEM_PAGEIN;
1384 shmem_swp_set(info, entry, 0);
1404 shmem_swp_unmap(entry); 1385 shmem_swp_unmap(entry);
1405 filepage = find_get_page(mapping, idx); 1386 delete_from_swap_cache(page);
1406 if (filepage && 1387 spin_unlock(&info->lock);
1407 (!PageUptodate(filepage) || !trylock_page(filepage))) { 1388 set_page_dirty(page);
1389 swap_free(swap);
1390
1391 } else if (sgp == SGP_READ) {
1392 shmem_swp_unmap(entry);
1393 page = find_get_page(mapping, idx);
1394 if (page && !trylock_page(page)) {
1408 spin_unlock(&info->lock); 1395 spin_unlock(&info->lock);
1409 wait_on_page_locked(filepage); 1396 wait_on_page_locked(page);
1410 page_cache_release(filepage); 1397 page_cache_release(page);
1411 filepage = NULL;
1412 goto repeat; 1398 goto repeat;
1413 } 1399 }
1414 spin_unlock(&info->lock); 1400 spin_unlock(&info->lock);
1415 } else { 1401
1402 } else if (prealloc_page) {
1416 shmem_swp_unmap(entry); 1403 shmem_swp_unmap(entry);
1417 sbinfo = SHMEM_SB(inode->i_sb); 1404 sbinfo = SHMEM_SB(inode->i_sb);
1418 if (sbinfo->max_blocks) { 1405 if (sbinfo->max_blocks) {
@@ -1426,121 +1413,82 @@ repeat:
1426 spin_unlock(&inode->i_lock); 1413 spin_unlock(&inode->i_lock);
1427 } else if (shmem_acct_block(info->flags)) 1414 } else if (shmem_acct_block(info->flags))
1428 goto nospace; 1415 goto nospace;
1429 1416
1430 if (!filepage) { 1417 page = prealloc_page;
1431 int ret; 1418 prealloc_page = NULL;
1432 1419
1433 if (!prealloc_page) { 1420 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1434 spin_unlock(&info->lock);
1435 filepage = shmem_alloc_page(gfp, info, idx);
1436 if (!filepage) {
1437 shmem_unacct_blocks(info->flags, 1);
1438 shmem_free_blocks(inode, 1);
1439 error = -ENOMEM;
1440 goto failed;
1441 }
1442 SetPageSwapBacked(filepage);
1443
1444 /*
1445 * Precharge page while we can wait, compensate
1446 * after
1447 */
1448 error = mem_cgroup_cache_charge(filepage,
1449 current->mm, GFP_KERNEL);
1450 if (error) {
1451 page_cache_release(filepage);
1452 shmem_unacct_blocks(info->flags, 1);
1453 shmem_free_blocks(inode, 1);
1454 filepage = NULL;
1455 goto failed;
1456 }
1457
1458 spin_lock(&info->lock);
1459 } else {
1460 filepage = prealloc_page;
1461 prealloc_page = NULL;
1462 SetPageSwapBacked(filepage);
1463 }
1464
1465 entry = shmem_swp_alloc(info, idx, sgp);
1466 if (IS_ERR(entry)) 1421 if (IS_ERR(entry))
1467 error = PTR_ERR(entry); 1422 error = PTR_ERR(entry);
1468 else { 1423 else {
1469 swap = *entry; 1424 swap = *entry;
1470 shmem_swp_unmap(entry); 1425 shmem_swp_unmap(entry);
1471 } 1426 }
1472 ret = error || swap.val; 1427 ret = error || swap.val;
1473 	if (ret) 	1428 	if (ret)
1474 mem_cgroup_uncharge_cache_page(filepage); 1429 mem_cgroup_uncharge_cache_page(page);
1475 else 1430 else
1476 ret = add_to_page_cache_lru(filepage, mapping, 1431 ret = add_to_page_cache_lru(page, mapping, idx, GFP_NOWAIT);
1477 idx, GFP_NOWAIT); 1432 /*
1478 /* 1433 * At add_to_page_cache_lru() failure,
1479 * At add_to_page_cache_lru() failure, uncharge will 1434 * uncharge will be done automatically.
1480 * be done automatically. 1435 */
1481 */ 1436 if (ret) {
1482 if (ret) { 1437 shmem_unacct_blocks(info->flags, 1);
1483 spin_unlock(&info->lock); 1438 shmem_free_blocks(inode, 1);
1484 page_cache_release(filepage); 1439 spin_unlock(&info->lock);
1485 shmem_unacct_blocks(info->flags, 1); 1440 page_cache_release(page);
1486 shmem_free_blocks(inode, 1); 1441 if (error)
1487 filepage = NULL; 1442 goto out;
1488 if (error) 1443 goto repeat;
1489 goto failed;
1490 goto repeat;
1491 }
1492 info->flags |= SHMEM_PAGEIN;
1493 } 1444 }
1494 1445
1446 info->flags |= SHMEM_PAGEIN;
1495 info->alloced++; 1447 info->alloced++;
1496 spin_unlock(&info->lock); 1448 spin_unlock(&info->lock);
1497 clear_highpage(filepage); 1449 clear_highpage(page);
1498 flush_dcache_page(filepage); 1450 flush_dcache_page(page);
1499 SetPageUptodate(filepage); 1451 SetPageUptodate(page);
1500 if (sgp == SGP_DIRTY) 1452 if (sgp == SGP_DIRTY)
1501 set_page_dirty(filepage); 1453 set_page_dirty(page);
1454
1455 } else {
1456 spin_unlock(&info->lock);
1457 error = -ENOMEM;
1458 goto out;
1502 } 1459 }
1503done: 1460done:
1504 *pagep = filepage; 1461 *pagep = page;
1505 error = 0; 1462 error = 0;
1506 goto out; 1463out:
1464 if (prealloc_page) {
1465 mem_cgroup_uncharge_cache_page(prealloc_page);
1466 page_cache_release(prealloc_page);
1467 }
1468 return error;
1507 1469
1508nospace: 1470nospace:
1509 /* 1471 /*
1510 * Perhaps the page was brought in from swap between find_lock_page 1472 * Perhaps the page was brought in from swap between find_lock_page
1511 * and taking info->lock? We allow for that at add_to_page_cache_lru, 1473 * and taking info->lock? We allow for that at add_to_page_cache_lru,
1512 * but must also avoid reporting a spurious ENOSPC while working on a 1474 * but must also avoid reporting a spurious ENOSPC while working on a
1513 * full tmpfs. (When filepage has been passed in to shmem_getpage, it 1475 * full tmpfs.
1514 * is already in page cache, which prevents this race from occurring.)
1515 */ 1476 */
1516 if (!filepage) { 1477 page = find_get_page(mapping, idx);
1517 struct page *page = find_get_page(mapping, idx);
1518 if (page) {
1519 spin_unlock(&info->lock);
1520 page_cache_release(page);
1521 goto repeat;
1522 }
1523 }
1524 spin_unlock(&info->lock); 1478 spin_unlock(&info->lock);
1525 error = -ENOSPC; 1479 if (page) {
1526failed: 1480 page_cache_release(page);
1527 if (*pagep != filepage) { 1481 goto repeat;
1528 unlock_page(filepage);
1529 page_cache_release(filepage);
1530 }
1531out:
1532 if (prealloc_page) {
1533 mem_cgroup_uncharge_cache_page(prealloc_page);
1534 page_cache_release(prealloc_page);
1535 } 1482 }
1536 return error; 1483 error = -ENOSPC;
1484 goto out;
1537} 1485}
1538 1486
1539static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1487static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1540{ 1488{
1541 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1489 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1542 int error; 1490 int error;
1543 int ret; 1491 int ret = VM_FAULT_LOCKED;
1544 1492
1545 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 1493 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1546 return VM_FAULT_SIGBUS; 1494 return VM_FAULT_SIGBUS;
@@ -1548,11 +1496,12 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1548 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1496 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1549 if (error) 1497 if (error)
1550 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1498 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1499
1551 if (ret & VM_FAULT_MAJOR) { 1500 if (ret & VM_FAULT_MAJOR) {
1552 count_vm_event(PGMAJFAULT); 1501 count_vm_event(PGMAJFAULT);
1553 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1502 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1554 } 1503 }
1555 return ret | VM_FAULT_LOCKED; 1504 return ret;
1556} 1505}
1557 1506
1558#ifdef CONFIG_NUMA 1507#ifdef CONFIG_NUMA
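A hedged caller-side sketch of the contract the rewrite above settles on: shmem_getpage() now always looks the page up itself (callers no longer pass one in), returns it locked in *pagep on success, and ORs VM_FAULT_MAJOR into *fault_type when swap I/O was needed. The wrapper below is hypothetical and not part of the patch.

static struct page *tmpfs_grab_page_sketch(struct inode *inode, pgoff_t index)
{
	struct page *page;
	int fault_type = 0;
	int error;

	error = shmem_getpage(inode, index, &page, SGP_CACHE, &fault_type);
	if (error)
		return ERR_PTR(error);
	/* shmem_getpage() hands the page back locked with a reference held */
	unlock_page(page);
	return page;
}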
@@ -1669,19 +1618,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1669static const struct inode_operations shmem_symlink_inode_operations; 1618static const struct inode_operations shmem_symlink_inode_operations;
1670static const struct inode_operations shmem_symlink_inline_operations; 1619static const struct inode_operations shmem_symlink_inline_operations;
1671 1620
1672/*
1673 * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
1674 * but providing them allows a tmpfs file to be used for splice, sendfile, and
1675 * below the loop driver, in the generic fashion that many filesystems support.
1676 */
1677static int shmem_readpage(struct file *file, struct page *page)
1678{
1679 struct inode *inode = page->mapping->host;
1680 int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
1681 unlock_page(page);
1682 return error;
1683}
1684
1685static int 1621static int
1686shmem_write_begin(struct file *file, struct address_space *mapping, 1622shmem_write_begin(struct file *file, struct address_space *mapping,
1687 loff_t pos, unsigned len, unsigned flags, 1623 loff_t pos, unsigned len, unsigned flags,
@@ -1689,7 +1625,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
1689{ 1625{
1690 struct inode *inode = mapping->host; 1626 struct inode *inode = mapping->host;
1691 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1627 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1692 *pagep = NULL;
1693 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); 1628 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1694} 1629}
1695 1630
@@ -1846,6 +1781,119 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1846 return retval; 1781 return retval;
1847} 1782}
1848 1783
1784static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1785 struct pipe_inode_info *pipe, size_t len,
1786 unsigned int flags)
1787{
1788 struct address_space *mapping = in->f_mapping;
1789 struct inode *inode = mapping->host;
1790 unsigned int loff, nr_pages, req_pages;
1791 struct page *pages[PIPE_DEF_BUFFERS];
1792 struct partial_page partial[PIPE_DEF_BUFFERS];
1793 struct page *page;
1794 pgoff_t index, end_index;
1795 loff_t isize, left;
1796 int error, page_nr;
1797 struct splice_pipe_desc spd = {
1798 .pages = pages,
1799 .partial = partial,
1800 .flags = flags,
1801 .ops = &page_cache_pipe_buf_ops,
1802 .spd_release = spd_release_page,
1803 };
1804
1805 isize = i_size_read(inode);
1806 if (unlikely(*ppos >= isize))
1807 return 0;
1808
1809 left = isize - *ppos;
1810 if (unlikely(left < len))
1811 len = left;
1812
1813 if (splice_grow_spd(pipe, &spd))
1814 return -ENOMEM;
1815
1816 index = *ppos >> PAGE_CACHE_SHIFT;
1817 loff = *ppos & ~PAGE_CACHE_MASK;
1818 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1819 nr_pages = min(req_pages, pipe->buffers);
1820
1821 spd.nr_pages = find_get_pages_contig(mapping, index,
1822 nr_pages, spd.pages);
1823 index += spd.nr_pages;
1824 error = 0;
1825
1826 while (spd.nr_pages < nr_pages) {
1827 error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
1828 if (error)
1829 break;
1830 unlock_page(page);
1831 spd.pages[spd.nr_pages++] = page;
1832 index++;
1833 }
1834
1835 index = *ppos >> PAGE_CACHE_SHIFT;
1836 nr_pages = spd.nr_pages;
1837 spd.nr_pages = 0;
1838
1839 for (page_nr = 0; page_nr < nr_pages; page_nr++) {
1840 unsigned int this_len;
1841
1842 if (!len)
1843 break;
1844
1845 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
1846 page = spd.pages[page_nr];
1847
1848 if (!PageUptodate(page) || page->mapping != mapping) {
1849 error = shmem_getpage(inode, index, &page,
1850 SGP_CACHE, NULL);
1851 if (error)
1852 break;
1853 unlock_page(page);
1854 page_cache_release(spd.pages[page_nr]);
1855 spd.pages[page_nr] = page;
1856 }
1857
1858 isize = i_size_read(inode);
1859 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1860 if (unlikely(!isize || index > end_index))
1861 break;
1862
1863 if (end_index == index) {
1864 unsigned int plen;
1865
1866 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1867 if (plen <= loff)
1868 break;
1869
1870 this_len = min(this_len, plen - loff);
1871 len = this_len;
1872 }
1873
1874 spd.partial[page_nr].offset = loff;
1875 spd.partial[page_nr].len = this_len;
1876 len -= this_len;
1877 loff = 0;
1878 spd.nr_pages++;
1879 index++;
1880 }
1881
1882 while (page_nr < nr_pages)
1883 page_cache_release(spd.pages[page_nr++]);
1884
1885 if (spd.nr_pages)
1886 error = splice_to_pipe(pipe, &spd);
1887
1888 splice_shrink_spd(pipe, &spd);
1889
1890 if (error > 0) {
1891 *ppos += error;
1892 file_accessed(in);
1893 }
1894 return error;
1895}
1896
1849static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1897static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1850{ 1898{
1851 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1899 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
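With ->splice_read now served by shmem_file_splice_read() (see the file_operations hunk further down), a tmpfs file can be spliced straight into a pipe without copying through a user buffer. A small userspace illustration; the path /dev/shm/example is an assumption and the file must already exist.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int pipefd[2];
	int fd = open("/dev/shm/example", O_RDONLY);
	ssize_t n;

	if (fd < 0 || pipe(pipefd) < 0) {
		perror("setup");
		return 1;
	}
	/* page references are moved into the pipe, not copied */
	n = splice(fd, NULL, pipefd[1], NULL, 65536, 0);
	if (n < 0) {
		perror("splice");
		return 1;
	}
	printf("spliced %zd bytes from tmpfs into the pipe\n", n);
	return 0;
}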
@@ -2006,7 +2054,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2006 int error; 2054 int error;
2007 int len; 2055 int len;
2008 struct inode *inode; 2056 struct inode *inode;
2009 struct page *page = NULL; 2057 struct page *page;
2010 char *kaddr; 2058 char *kaddr;
2011 struct shmem_inode_info *info; 2059 struct shmem_inode_info *info;
2012 2060
@@ -2684,7 +2732,6 @@ static const struct address_space_operations shmem_aops = {
2684 .writepage = shmem_writepage, 2732 .writepage = shmem_writepage,
2685 .set_page_dirty = __set_page_dirty_no_writeback, 2733 .set_page_dirty = __set_page_dirty_no_writeback,
2686#ifdef CONFIG_TMPFS 2734#ifdef CONFIG_TMPFS
2687 .readpage = shmem_readpage,
2688 .write_begin = shmem_write_begin, 2735 .write_begin = shmem_write_begin,
2689 .write_end = shmem_write_end, 2736 .write_end = shmem_write_end,
2690#endif 2737#endif
@@ -2701,7 +2748,7 @@ static const struct file_operations shmem_file_operations = {
2701 .aio_read = shmem_file_aio_read, 2748 .aio_read = shmem_file_aio_read,
2702 .aio_write = generic_file_aio_write, 2749 .aio_write = generic_file_aio_write,
2703 .fsync = noop_fsync, 2750 .fsync = noop_fsync,
2704 .splice_read = generic_file_splice_read, 2751 .splice_read = shmem_file_splice_read,
2705 .splice_write = generic_file_splice_write, 2752 .splice_write = generic_file_splice_write,
2706#endif 2753#endif
2707}; 2754};
@@ -3015,6 +3062,15 @@ put_memory:
3015} 3062}
3016EXPORT_SYMBOL_GPL(shmem_file_setup); 3063EXPORT_SYMBOL_GPL(shmem_file_setup);
3017 3064
3065void shmem_set_file(struct vm_area_struct *vma, struct file *file)
3066{
3067 if (vma->vm_file)
3068 fput(vma->vm_file);
3069 vma->vm_file = file;
3070 vma->vm_ops = &shmem_vm_ops;
3071 vma->vm_flags |= VM_CAN_NONLINEAR;
3072}
3073
3018/** 3074/**
3019 * shmem_zero_setup - setup a shared anonymous mapping 3075 * shmem_zero_setup - setup a shared anonymous mapping
3020 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff 3076 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
@@ -3028,11 +3084,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
3028 if (IS_ERR(file)) 3084 if (IS_ERR(file))
3029 return PTR_ERR(file); 3085 return PTR_ERR(file);
3030 3086
3031 if (vma->vm_file) 3087 shmem_set_file(vma, file);
3032 fput(vma->vm_file);
3033 vma->vm_file = file;
3034 vma->vm_ops = &shmem_vm_ops;
3035 vma->vm_flags |= VM_CAN_NONLINEAR;
3036 return 0; 3088 return 0;
3037} 3089}
3038 3090
@@ -3048,13 +3100,29 @@ int shmem_zero_setup(struct vm_area_struct *vma)
3048 * suit tmpfs, since it may have pages in swapcache, and needs to find those 3100 * suit tmpfs, since it may have pages in swapcache, and needs to find those
3049 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. 3101 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
3050 * 3102 *
3051 * Provide a stub for those callers to start using now, then later 3103 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
3052 * flesh it out to call shmem_getpage() with additional gfp mask, when 3104 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
3053 * shmem_file_splice_read() is added and shmem_readpage() is removed.
3054 */ 3105 */
3055struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 3106struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
3056 pgoff_t index, gfp_t gfp) 3107 pgoff_t index, gfp_t gfp)
3057{ 3108{
3109#ifdef CONFIG_SHMEM
3110 struct inode *inode = mapping->host;
3111 struct page *page;
3112 int error;
3113
3114 BUG_ON(mapping->a_ops != &shmem_aops);
3115 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
3116 if (error)
3117 page = ERR_PTR(error);
3118 else
3119 unlock_page(page);
3120 return page;
3121#else
3122 /*
3123 * The tiny !SHMEM case uses ramfs without swap
3124 */
3058 return read_cache_page_gfp(mapping, index, gfp); 3125 return read_cache_page_gfp(mapping, index, gfp);
3126#endif
3059} 3127}
3060EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); 3128EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
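The comment above describes how a caller such as i915 relaxes the gfp mask; a hedged sketch of that driver-side pattern follows. The function name and surrounding context are illustrative, not taken from any driver.

#include <linux/pagemap.h>
#include <linux/shmem_fs.h>

static struct page *grab_backing_page_sketch(struct address_space *mapping,
					     pgoff_t index)
{
	/* fail quietly and let the caller fall back, rather than OOMing */
	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;

	/* returns the uptodate page, or an ERR_PTR on failure */
	return shmem_read_mapping_page_gfp(mapping, index, gfp);
}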
diff --git a/mm/slab.c b/mm/slab.c
index d96e223de77..a67f8121ce5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3218,12 +3218,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3218 if (in_interrupt() || (flags & __GFP_THISNODE)) 3218 if (in_interrupt() || (flags & __GFP_THISNODE))
3219 return NULL; 3219 return NULL;
3220 nid_alloc = nid_here = numa_mem_id(); 3220 nid_alloc = nid_here = numa_mem_id();
3221 get_mems_allowed();
3222 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3221 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3223 nid_alloc = cpuset_slab_spread_node(); 3222 nid_alloc = cpuset_slab_spread_node();
3224 else if (current->mempolicy) 3223 else if (current->mempolicy)
3225 nid_alloc = slab_node(current->mempolicy); 3224 nid_alloc = slab_node(current->mempolicy);
3226 put_mems_allowed();
3227 if (nid_alloc != nid_here) 3225 if (nid_alloc != nid_here)
3228 return ____cache_alloc_node(cachep, flags, nid_alloc); 3226 return ____cache_alloc_node(cachep, flags, nid_alloc);
3229 return NULL; 3227 return NULL;
@@ -3246,14 +3244,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3246 enum zone_type high_zoneidx = gfp_zone(flags); 3244 enum zone_type high_zoneidx = gfp_zone(flags);
3247 void *obj = NULL; 3245 void *obj = NULL;
3248 int nid; 3246 int nid;
3247 unsigned int cpuset_mems_cookie;
3249 3248
3250 if (flags & __GFP_THISNODE) 3249 if (flags & __GFP_THISNODE)
3251 return NULL; 3250 return NULL;
3252 3251
3253 get_mems_allowed();
3254 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3255 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3252 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3256 3253
3254retry_cpuset:
3255 cpuset_mems_cookie = get_mems_allowed();
3256 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3257
3257retry: 3258retry:
3258 /* 3259 /*
3259 * Look through allowed nodes for objects available 3260 * Look through allowed nodes for objects available
@@ -3306,7 +3307,9 @@ retry:
3306 } 3307 }
3307 } 3308 }
3308 } 3309 }
3309 put_mems_allowed(); 3310
3311 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
3312 goto retry_cpuset;
3310 return obj; 3313 return obj;
3311} 3314}
3312 3315
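The slab.c conversion above follows the cpuset "mems cookie" idiom: snapshot the allowed node mask, attempt the allocation, and retry only when put_mems_allowed() reports that the mask changed underneath us and nothing was allocated. Sketched generically below; the allocation step is a hypothetical stand-in for the zonelist walk.

static void *alloc_with_cpuset_retry_sketch(gfp_t flags)
{
	unsigned int cpuset_mems_cookie;
	void *obj;

retry_cpuset:
	cpuset_mems_cookie = get_mems_allowed();	/* snapshot allowed nodes */
	obj = try_nodes_in_zonelist(flags);		/* hypothetical allocation step */

	/* the mask changed while we looked and we came back empty-handed: retry */
	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
		goto retry_cpuset;

	return obj;
}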
diff --git a/mm/slqb.c b/mm/slqb.c
new file mode 100644
index 00000000000..fbd2ebde3c3
--- /dev/null
+++ b/mm/slqb.c
@@ -0,0 +1,3816 @@
1/*
2 * SLQB: A slab allocator that focuses on per-CPU scaling, and good performance
  3 * with order-0 allocations. Fastpath emphasis is placed on local allocation
4 * and freeing, but with a secondary goal of good remote freeing (freeing on
  5 * a CPU other than the one which allocated).
6 *
7 * Using ideas and code from mm/slab.c, mm/slob.c, and mm/slub.c.
8 */
9
10#include <linux/mm.h>
11#include <linux/swap.h> /* struct reclaim_state */
12#include <linux/module.h>
13#include <linux/interrupt.h>
14#include <linux/slab.h>
15#include <linux/seq_file.h>
16#include <linux/cpu.h>
17#include <linux/cpuset.h>
18#include <linux/mempolicy.h>
19#include <linux/ctype.h>
20#include <linux/kallsyms.h>
21#include <linux/memory.h>
22#include <linux/fault-inject.h>
23
24/*
25 * TODO
26 * - fix up releasing of offlined data structures. Not a big deal because
27 * they don't get cumulatively leaked with successive online/offline cycles
28 * - allow OOM conditions to flush back per-CPU pages to common lists to be
29 * reused by other CPUs.
 30 * - investigate performance with memoryless nodes. Perhaps CPUs can be given
31 * a default closest home node via which it can use fastpath functions.
32 * Perhaps it is not a big problem.
33 */
34
35/*
 36 * slqb_page overloads struct page, and is used to manage some slab allocation
 37 * aspects; however, to avoid the horrible mess in include/linux/mm_types.h,
38 * we'll just define our own struct slqb_page type variant here.
39 */
40struct slqb_page {
41 union {
42 struct {
43 unsigned long flags; /* mandatory */
44 atomic_t _count; /* mandatory */
45 unsigned int inuse; /* Nr of objects */
46 struct kmem_cache_list *list; /* Pointer to list */
47 void **freelist; /* LIFO freelist */
48 union {
49 struct list_head lru; /* misc. list */
50 struct rcu_head rcu_head; /* for rcu freeing */
51 };
52 };
53 struct page page;
54 };
55};
56static inline void struct_slqb_page_wrong_size(void)
57{ BUILD_BUG_ON(sizeof(struct slqb_page) != sizeof(struct page)); }
58
59#define PG_SLQB_BIT (1 << PG_slab)
60
61/*
62 * slqb_min_order: minimum allocation order for slabs
63 */
64static int slqb_min_order;
65
66/*
67 * slqb_min_objects: minimum number of objects per slab. Increasing this
68 * will increase the allocation order for slabs with larger objects
69 */
70static int slqb_min_objects = 1;
71
72#ifdef CONFIG_NUMA
73static inline int slab_numa(struct kmem_cache *s)
74{
75 return s->flags & SLAB_NUMA;
76}
77#else
78static inline int slab_numa(struct kmem_cache *s)
79{
80 return 0;
81}
82#endif
83
84static inline int slab_hiwater(struct kmem_cache *s)
85{
86 return s->hiwater;
87}
88
89static inline int slab_freebatch(struct kmem_cache *s)
90{
91 return s->freebatch;
92}
93
94/*
95 * Lock order:
96 * kmem_cache_node->list_lock
97 * kmem_cache_remote_free->lock
98 *
99 * Data structures:
100 * SLQB is primarily per-cpu. For each kmem_cache, each CPU has:
101 *
102 * - A LIFO list of node-local objects. Allocation and freeing of node local
103 * objects goes first to this list.
104 *
 105 * - Two lists of slab pages, free and partial pages. If an allocation misses
106 * the object list, it tries from the partial list, then the free list.
107 * After freeing an object to the object list, if it is over a watermark,
108 * some objects are freed back to pages. If an allocation misses these lists,
109 * a new slab page is allocated from the page allocator. If the free list
110 * reaches a watermark, some of its pages are returned to the page allocator.
111 *
112 * - A remote free queue, where objects freed that did not come from the local
113 * node are queued to. When this reaches a watermark, the objects are
114 * flushed.
115 *
116 * - A remotely freed queue, where objects allocated from this CPU are flushed
117 * to from other CPUs' remote free queues. kmem_cache_remote_free->lock is
118 * used to protect access to this queue.
119 *
120 * When the remotely freed queue reaches a watermark, a flag is set to tell
121 * the owner CPU to check it. The owner CPU will then check the queue on the
122 * next allocation that misses the object list. It will move all objects from
123 * this list onto the object list and then allocate one.
124 *
125 * This system of remote queueing is intended to reduce lock and remote
126 * cacheline acquisitions, and give a cooling off period for remotely freed
127 * objects before they are re-allocated.
128 *
 129 * Node-specific allocations from somewhere other than the local node are
130 * handled by a per-node list which is the same as the above per-CPU data
131 * structures except for the following differences:
132 *
133 * - kmem_cache_node->list_lock is used to protect access for multiple CPUs to
134 * allocate from a given node.
135 *
136 * - There is no remote free queue. Nodes don't free objects, CPUs do.
137 */
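A hedged illustration of the remote-freeing scheme described in the comment above: the owner CPU drains the queue that other CPUs filled only when an allocation misses its local object list and the watermark flag was set. Every structure and field name below is invented for the sketch and does not appear in SLQB itself.

#include <linux/spinlock.h>

struct sketch_object {
	struct sketch_object *next;
};

struct sketch_cpu_cache {
	struct sketch_object *local_free;	/* LIFO of node-local objects */
	spinlock_t            remote_lock;	/* protects the queue other CPUs fill */
	struct sketch_object *remotely_freed;	/* objects other CPUs handed back */
	bool                  check_remote;	/* set by a remote CPU at its watermark */
};

static void *sketch_alloc(struct sketch_cpu_cache *c)
{
	struct sketch_object *obj;

	if (!c->local_free && c->check_remote) {
		/* allocation missed the object list: drain the remotely freed queue */
		spin_lock(&c->remote_lock);
		c->local_free = c->remotely_freed;
		c->remotely_freed = NULL;
		c->check_remote = false;
		spin_unlock(&c->remote_lock);
	}

	obj = c->local_free;		/* may still be NULL: fall back to slab pages */
	if (obj)
		c->local_free = obj->next;
	return obj;
}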
138
139static inline void slqb_stat_inc(struct kmem_cache_list *list,
140 enum stat_item si)
141{
142#ifdef CONFIG_SLQB_STATS
143 list->stats[si]++;
144#endif
145}
146
147static inline void slqb_stat_add(struct kmem_cache_list *list,
148 enum stat_item si, unsigned long nr)
149{
150#ifdef CONFIG_SLQB_STATS
151 list->stats[si] += nr;
152#endif
153}
154
155static inline int slqb_page_to_nid(struct slqb_page *page)
156{
157 return page_to_nid(&page->page);
158}
159
160static inline void *slqb_page_address(struct slqb_page *page)
161{
162 return page_address(&page->page);
163}
164
165static inline struct zone *slqb_page_zone(struct slqb_page *page)
166{
167 return page_zone(&page->page);
168}
169
170static inline int virt_to_nid(const void *addr)
171{
172 return page_to_nid(virt_to_page(addr));
173}
174
175static inline struct slqb_page *virt_to_head_slqb_page(const void *addr)
176{
177 struct page *p;
178
179 p = virt_to_head_page(addr);
180 return (struct slqb_page *)p;
181}
182
183static inline void __free_slqb_pages(struct slqb_page *page, unsigned int order,
184 int pages)
185{
186 struct page *p = &page->page;
187
188 reset_page_mapcount(p);
189 p->mapping = NULL;
190 VM_BUG_ON(!(p->flags & PG_SLQB_BIT));
191 p->flags &= ~PG_SLQB_BIT;
192
193 if (current->reclaim_state)
194 current->reclaim_state->reclaimed_slab += pages;
195 __free_pages(p, order);
196}
197
198#ifdef CONFIG_SLQB_DEBUG
199static inline int slab_debug(struct kmem_cache *s)
200{
201 return s->flags &
202 (SLAB_DEBUG_FREE |
203 SLAB_RED_ZONE |
204 SLAB_POISON |
205 SLAB_STORE_USER |
206 SLAB_TRACE);
207}
208static inline int slab_poison(struct kmem_cache *s)
209{
210 return s->flags & SLAB_POISON;
211}
212#else
213static inline int slab_debug(struct kmem_cache *s)
214{
215 return 0;
216}
217static inline int slab_poison(struct kmem_cache *s)
218{
219 return 0;
220}
221#endif
222
223#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
224 SLAB_POISON | SLAB_STORE_USER)
225
226/* Internal SLQB flags */
227#define __OBJECT_POISON 0x80000000 /* Poison object */
228
229/* Not all arches define cache_line_size */
230#ifndef cache_line_size
231#define cache_line_size() L1_CACHE_BYTES
232#endif
233
234#ifdef CONFIG_SMP
235static struct notifier_block slab_notifier;
236#endif
237
238/*
239 * slqb_lock protects slab_caches list and serialises hotplug operations.
 240 * Hotplug operations take the lock for write; other operations can hold off
 241 * hotplug by taking it for read (or write).
242 */
243static DECLARE_RWSEM(slqb_lock);
244
245/*
246 * A list of all slab caches on the system
247 */
248static LIST_HEAD(slab_caches);
249
250/*
251 * Tracking user of a slab.
252 */
253struct track {
254 unsigned long addr; /* Called from address */
255 int cpu; /* Was running on cpu */
256 int pid; /* Pid context */
257 unsigned long when; /* When did the operation occur */
258};
259
260enum track_item { TRACK_ALLOC, TRACK_FREE };
261
262static struct kmem_cache kmem_cache_cache;
263
264#ifdef CONFIG_SLQB_SYSFS
265static int sysfs_slab_add(struct kmem_cache *s);
266static void sysfs_slab_remove(struct kmem_cache *s);
267#else
268static inline int sysfs_slab_add(struct kmem_cache *s)
269{
270 return 0;
271}
272static inline void sysfs_slab_remove(struct kmem_cache *s)
273{
274 kmem_cache_free(&kmem_cache_cache, s);
275}
276#endif
277
278/********************************************************************
279 * Core slab cache functions
280 *******************************************************************/
281
282static int __slab_is_available __read_mostly;
283int slab_is_available(void)
284{
285 return __slab_is_available;
286}
287
288static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
289{
290#ifdef CONFIG_SMP
291 VM_BUG_ON(!s->cpu_slab[cpu]);
292 return s->cpu_slab[cpu];
293#else
294 return &s->cpu_slab;
295#endif
296}
297
298static inline int check_valid_pointer(struct kmem_cache *s,
299 struct slqb_page *page, const void *object)
300{
301 void *base;
302
303 base = slqb_page_address(page);
304 if (object < base || object >= base + s->objects * s->size ||
305 (object - base) % s->size) {
306 return 0;
307 }
308
309 return 1;
310}
311
312static inline void *get_freepointer(struct kmem_cache *s, void *object)
313{
314 return *(void **)(object + s->offset);
315}
316
317static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
318{
319 *(void **)(object + s->offset) = fp;
320}
321
322/* Loop over all objects in a slab */
323#define for_each_object(__p, __s, __addr) \
324 for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
325 __p += (__s)->size)
326
327/* Scan freelist */
328#define for_each_free_object(__p, __s, __free) \
329 for (__p = (__free); (__p) != NULL; __p = get_freepointer((__s),\
330 __p))
331
332#ifdef CONFIG_SLQB_DEBUG
333/*
334 * Debug settings:
335 */
336#ifdef CONFIG_SLQB_DEBUG_ON
337static int slqb_debug __read_mostly = DEBUG_DEFAULT_FLAGS;
338#else
339static int slqb_debug __read_mostly;
340#endif
341
342static char *slqb_debug_slabs;
343
344/*
345 * Object debugging
346 */
347static void print_section(char *text, u8 *addr, unsigned int length)
348{
349 int i, offset;
350 int newline = 1;
351 char ascii[17];
352
353 ascii[16] = 0;
354
355 for (i = 0; i < length; i++) {
356 if (newline) {
357 printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
358 newline = 0;
359 }
360 printk(KERN_CONT " %02x", addr[i]);
361 offset = i % 16;
362 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
363 if (offset == 15) {
364 printk(KERN_CONT " %s\n", ascii);
365 newline = 1;
366 }
367 }
368 if (!newline) {
369 i %= 16;
370 while (i < 16) {
371 printk(KERN_CONT " ");
372 ascii[i] = ' ';
373 i++;
374 }
375 printk(KERN_CONT " %s\n", ascii);
376 }
377}
378
379static struct track *get_track(struct kmem_cache *s, void *object,
380 enum track_item alloc)
381{
382 struct track *p;
383
384 if (s->offset)
385 p = object + s->offset + sizeof(void *);
386 else
387 p = object + s->inuse;
388
389 return p + alloc;
390}
391
392static void set_track(struct kmem_cache *s, void *object,
393 enum track_item alloc, unsigned long addr)
394{
395 struct track *p;
396
397 if (s->offset)
398 p = object + s->offset + sizeof(void *);
399 else
400 p = object + s->inuse;
401
402 p += alloc;
403 if (addr) {
404 p->addr = addr;
405 p->cpu = raw_smp_processor_id();
406 p->pid = current ? current->pid : -1;
407 p->when = jiffies;
408 } else
409 memset(p, 0, sizeof(struct track));
410}
411
412static void init_tracking(struct kmem_cache *s, void *object)
413{
414 if (!(s->flags & SLAB_STORE_USER))
415 return;
416
417 set_track(s, object, TRACK_FREE, 0UL);
418 set_track(s, object, TRACK_ALLOC, 0UL);
419}
420
421static void print_track(const char *s, struct track *t)
422{
423 if (!t->addr)
424 return;
425
426 printk(KERN_ERR "INFO: %s in ", s);
427 __print_symbol("%s", (unsigned long)t->addr);
428 printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
429}
430
431static void print_tracking(struct kmem_cache *s, void *object)
432{
433 if (!(s->flags & SLAB_STORE_USER))
434 return;
435
436 print_track("Allocated", get_track(s, object, TRACK_ALLOC));
437 print_track("Freed", get_track(s, object, TRACK_FREE));
438}
439
440static void print_page_info(struct slqb_page *page)
441{
442 printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n",
443 page, page->inuse, page->freelist, page->flags);
444
445}
446
447#define MAX_ERR_STR 100
448static void slab_bug(struct kmem_cache *s, char *fmt, ...)
449{
450 va_list args;
451 char buf[MAX_ERR_STR];
452
453 va_start(args, fmt);
454 vsnprintf(buf, sizeof(buf), fmt, args);
455 va_end(args);
456 printk(KERN_ERR "========================================"
457 "=====================================\n");
458 printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
459 printk(KERN_ERR "----------------------------------------"
460 "-------------------------------------\n\n");
461}
462
463static void slab_fix(struct kmem_cache *s, char *fmt, ...)
464{
465 va_list args;
466 char buf[100];
467
468 va_start(args, fmt);
469 vsnprintf(buf, sizeof(buf), fmt, args);
470 va_end(args);
471 printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
472}
473
474static void print_trailer(struct kmem_cache *s, struct slqb_page *page, u8 *p)
475{
476 unsigned int off; /* Offset of last byte */
477 u8 *addr = slqb_page_address(page);
478
479 print_tracking(s, p);
480
481 print_page_info(page);
482
483 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
484 p, p - addr, get_freepointer(s, p));
485
486 if (p > addr + 16)
487 print_section("Bytes b4", p - 16, 16);
488
489 print_section("Object", p, min(s->objsize, 128));
490
491 if (s->flags & SLAB_RED_ZONE)
492 print_section("Redzone", p + s->objsize, s->inuse - s->objsize);
493
494 if (s->offset)
495 off = s->offset + sizeof(void *);
496 else
497 off = s->inuse;
498
499 if (s->flags & SLAB_STORE_USER)
500 off += 2 * sizeof(struct track);
501
502 if (off != s->size) {
503 /* Beginning of the filler is the free pointer */
504 print_section("Padding", p + off, s->size - off);
505 }
506
507 dump_stack();
508}
509
510static void object_err(struct kmem_cache *s, struct slqb_page *page,
511 u8 *object, char *reason)
512{
513 slab_bug(s, reason);
514 print_trailer(s, page, object);
515}
516
517static void slab_err(struct kmem_cache *s, struct slqb_page *page,
518 char *fmt, ...)
519{
520 slab_bug(s, fmt);
521 print_page_info(page);
522 dump_stack();
523}
524
525static void init_object(struct kmem_cache *s, void *object, int active)
526{
527 u8 *p = object;
528
529 if (s->flags & __OBJECT_POISON) {
530 memset(p, POISON_FREE, s->objsize - 1);
531 p[s->objsize - 1] = POISON_END;
532 }
533
534 if (s->flags & SLAB_RED_ZONE) {
535 memset(p + s->objsize,
536 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
537 s->inuse - s->objsize);
538 }
539}
540
541static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
542{
543 while (bytes) {
544 if (*start != (u8)value)
545 return start;
546 start++;
547 bytes--;
548 }
549 return NULL;
550}
551
552static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
553 void *from, void *to)
554{
555 slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
556 memset(from, data, to - from);
557}
558
559static int check_bytes_and_report(struct kmem_cache *s, struct slqb_page *page,
560 u8 *object, char *what,
561 u8 *start, unsigned int value, unsigned int bytes)
562{
563 u8 *fault;
564 u8 *end;
565
566 fault = check_bytes(start, value, bytes);
567 if (!fault)
568 return 1;
569
570 end = start + bytes;
571 while (end > fault && end[-1] == value)
572 end--;
573
574 slab_bug(s, "%s overwritten", what);
575 printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
576 fault, end - 1, fault[0], value);
577 print_trailer(s, page, object);
578
579 restore_bytes(s, what, value, fault, end);
580 return 0;
581}
582
583/*
584 * Object layout:
585 *
586 * object address
587 * Bytes of the object to be managed.
588 * If the freepointer may overlay the object then the free
589 * pointer is the first word of the object.
590 *
591 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
592 * 0xa5 (POISON_END)
593 *
594 * object + s->objsize
595 * Padding to reach word boundary. This is also used for Redzoning.
596 * Padding is extended by another word if Redzoning is enabled and
597 * objsize == inuse.
598 *
599 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
600 * 0xcc (RED_ACTIVE) for objects in use.
601 *
602 * object + s->inuse
603 * Meta data starts here.
604 *
605 * A. Free pointer (if we cannot overwrite object on free)
606 * B. Tracking data for SLAB_STORE_USER
 607 * C. Padding to reach required alignment boundary or at minimum
 608 * 	one word if debugging is on to be able to detect writes
609 * before the word boundary.
610 *
611 * Padding is done using 0x5a (POISON_INUSE)
612 *
613 * object + s->size
614 * Nothing is used beyond s->size.
615 */
616
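The arithmetic implied by the layout above is what print_trailer() earlier and check_pad_bytes() below both perform; restated as a hedged stand-alone helper for illustration only.

/* Illustrative only: where the POISON_INUSE padding begins for a cache. */
static unsigned long metadata_end_sketch(struct kmem_cache *s)
{
	unsigned long off = s->offset ? s->offset + sizeof(void *) : s->inuse;

	if (s->flags & SLAB_STORE_USER)
		off += 2 * sizeof(struct track);	/* TRACK_ALLOC + TRACK_FREE */

	return off;	/* bytes [off, s->size) are POISON_INUSE filler */
}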
617static int check_pad_bytes(struct kmem_cache *s, struct slqb_page *page, u8 *p)
618{
619 unsigned long off = s->inuse; /* The end of info */
620
621 if (s->offset) {
622 /* Freepointer is placed after the object. */
623 off += sizeof(void *);
624 }
625
626 if (s->flags & SLAB_STORE_USER) {
627 /* We also have user information there */
628 off += 2 * sizeof(struct track);
629 }
630
631 if (s->size == off)
632 return 1;
633
634 return check_bytes_and_report(s, page, p, "Object padding",
635 p + off, POISON_INUSE, s->size - off);
636}
637
638static int slab_pad_check(struct kmem_cache *s, struct slqb_page *page)
639{
640 u8 *start;
641 u8 *fault;
642 u8 *end;
643 int length;
644 int remainder;
645
646 if (!(s->flags & SLAB_POISON))
647 return 1;
648
649 start = slqb_page_address(page);
650 end = start + (PAGE_SIZE << s->order);
651 length = s->objects * s->size;
652 remainder = end - (start + length);
653 if (!remainder)
654 return 1;
655
656 fault = check_bytes(start + length, POISON_INUSE, remainder);
657 if (!fault)
658 return 1;
659
660 while (end > fault && end[-1] == POISON_INUSE)
661 end--;
662
663 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
664 print_section("Padding", start, length);
665
666 restore_bytes(s, "slab padding", POISON_INUSE, start, end);
667 return 0;
668}
669
670static int check_object(struct kmem_cache *s, struct slqb_page *page,
671 void *object, int active)
672{
673 u8 *p = object;
674 u8 *endobject = object + s->objsize;
675
676 if (s->flags & SLAB_RED_ZONE) {
677 unsigned int red =
678 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
679
680 if (!check_bytes_and_report(s, page, object, "Redzone",
681 endobject, red, s->inuse - s->objsize))
682 return 0;
683 } else {
684 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
685 check_bytes_and_report(s, page, p, "Alignment padding",
686 endobject, POISON_INUSE, s->inuse - s->objsize);
687 }
688 }
689
690 if (s->flags & SLAB_POISON) {
691 if (!active && (s->flags & __OBJECT_POISON)) {
692 if (!check_bytes_and_report(s, page, p, "Poison", p,
693 POISON_FREE, s->objsize - 1))
694 return 0;
695
696 if (!check_bytes_and_report(s, page, p, "Poison",
697 p + s->objsize - 1, POISON_END, 1))
698 return 0;
699 }
700
701 /*
702 * check_pad_bytes cleans up on its own.
703 */
704 check_pad_bytes(s, page, p);
705 }
706
707 return 1;
708}
709
710static int check_slab(struct kmem_cache *s, struct slqb_page *page)
711{
712 if (!(page->flags & PG_SLQB_BIT)) {
713 slab_err(s, page, "Not a valid slab page");
714 return 0;
715 }
716 if (page->inuse == 0) {
 717 		slab_err(s, page, "inuse before free / after alloc");
718 return 0;
719 }
720 if (page->inuse > s->objects) {
 721 		slab_err(s, page, "inuse %u > max %u",
 722 			page->inuse, s->objects);
723 return 0;
724 }
725 /* Slab_pad_check fixes things up after itself */
726 slab_pad_check(s, page);
727 return 1;
728}
729
730static void trace(struct kmem_cache *s, struct slqb_page *page,
731 void *object, int alloc)
732{
733 if (s->flags & SLAB_TRACE) {
734 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
735 s->name,
736 alloc ? "alloc" : "free",
737 object, page->inuse,
738 page->freelist);
739
740 if (!alloc)
741 print_section("Object", (void *)object, s->objsize);
742
743 dump_stack();
744 }
745}
746
747static void setup_object_debug(struct kmem_cache *s, struct slqb_page *page,
748 void *object)
749{
750 if (!slab_debug(s))
751 return;
752
753 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
754 return;
755
756 init_object(s, object, 0);
757 init_tracking(s, object);
758}
759
760static int alloc_debug_processing(struct kmem_cache *s,
761 void *object, unsigned long addr)
762{
763 struct slqb_page *page;
764 page = virt_to_head_slqb_page(object);
765
766 if (!check_slab(s, page))
767 goto bad;
768
769 if (!check_valid_pointer(s, page, object)) {
770 object_err(s, page, object, "Freelist Pointer check fails");
771 goto bad;
772 }
773
774 if (object && !check_object(s, page, object, 0))
775 goto bad;
776
 777 	/* Success: perform special debug activities for allocs */
778 if (s->flags & SLAB_STORE_USER)
779 set_track(s, object, TRACK_ALLOC, addr);
780 trace(s, page, object, 1);
781 init_object(s, object, 1);
782 return 1;
783
784bad:
785 return 0;
786}
787
788static int free_debug_processing(struct kmem_cache *s,
789 void *object, unsigned long addr)
790{
791 struct slqb_page *page;
792 page = virt_to_head_slqb_page(object);
793
794 if (!check_slab(s, page))
795 goto fail;
796
797 if (!check_valid_pointer(s, page, object)) {
798 slab_err(s, page, "Invalid object pointer 0x%p", object);
799 goto fail;
800 }
801
802 if (!check_object(s, page, object, 1))
803 return 0;
804
805 /* Special debug activities for freeing objects */
806 if (s->flags & SLAB_STORE_USER)
807 set_track(s, object, TRACK_FREE, addr);
808 trace(s, page, object, 0);
809 init_object(s, object, 0);
810 return 1;
811
812fail:
813 slab_fix(s, "Object at 0x%p not freed", object);
814 return 0;
815}
816
817static int __init setup_slqb_debug(char *str)
818{
819 slqb_debug = DEBUG_DEFAULT_FLAGS;
820 if (*str++ != '=' || !*str) {
821 /*
822 * No options specified. Switch on full debugging.
823 */
824 goto out;
825 }
826
827 if (*str == ',') {
828 /*
829 * No options but restriction on slabs. This means full
830 * debugging for slabs matching a pattern.
831 */
832 goto check_slabs;
833 }
834
835 slqb_debug = 0;
836 if (*str == '-') {
837 /*
838 * Switch off all debugging measures.
839 */
840 goto out;
841 }
842
843 /*
844 * Determine which debug features should be switched on
845 */
846 for (; *str && *str != ','; str++) {
847 switch (tolower(*str)) {
848 case 'f':
849 slqb_debug |= SLAB_DEBUG_FREE;
850 break;
851 case 'z':
852 slqb_debug |= SLAB_RED_ZONE;
853 break;
854 case 'p':
855 slqb_debug |= SLAB_POISON;
856 break;
857 case 'u':
858 slqb_debug |= SLAB_STORE_USER;
859 break;
860 case 't':
861 slqb_debug |= SLAB_TRACE;
862 break;
863 case 'a':
864 slqb_debug |= SLAB_FAILSLAB;
865 break;
866 default:
867 printk(KERN_ERR "slqb_debug option '%c' "
868 "unknown. skipped\n", *str);
869 }
870 }
871
872check_slabs:
873 if (*str == ',')
874 slqb_debug_slabs = str + 1;
875out:
876 return 1;
877}
878__setup("slqb_debug", setup_slqb_debug);
879
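The parser above takes an option string followed by an optional cache-name prefix; the flag letters may be given in either case (they are folded with tolower()), and the name pattern is a prefix match via strncmp() in kmem_cache_flags(). Illustrative boot-command-line values:

    slqb_debug              enable the default set (sanity checks, red zoning,
                            poisoning, user tracking) for all caches
    slqb_debug=P,kmalloc-   poisoning only, restricted to caches whose names
                            start with "kmalloc-"
    slqb_debug=-            switch all debug features off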
880static int __init setup_slqb_min_order(char *str)
881{
882 get_option(&str, &slqb_min_order);
883 slqb_min_order = min(slqb_min_order, MAX_ORDER - 1);
884
885 return 1;
886}
887__setup("slqb_min_order=", setup_slqb_min_order);
888
889static int __init setup_slqb_min_objects(char *str)
890{
891 get_option(&str, &slqb_min_objects);
892
893 return 1;
894}
895
896__setup("slqb_min_objects=", setup_slqb_min_objects);
897
898static unsigned long kmem_cache_flags(unsigned long objsize,
899 unsigned long flags, const char *name,
900 void (*ctor)(void *))
901{
902 /*
903 * Enable debugging if selected on the kernel commandline.
904 */
905 if (slqb_debug && (!slqb_debug_slabs ||
906 strncmp(slqb_debug_slabs, name,
907 strlen(slqb_debug_slabs)) == 0))
908 flags |= slqb_debug;
909
910 if (num_possible_nodes() > 1)
911 flags |= SLAB_NUMA;
912
913 return flags;
914}
915#else
916static inline void setup_object_debug(struct kmem_cache *s,
917 struct slqb_page *page, void *object)
918{
919}
920
921static inline int alloc_debug_processing(struct kmem_cache *s,
922 void *object, unsigned long addr)
923{
924 return 0;
925}
926
927static inline int free_debug_processing(struct kmem_cache *s,
928 void *object, unsigned long addr)
929{
930 return 0;
931}
932
933static inline int slab_pad_check(struct kmem_cache *s, struct slqb_page *page)
934{
935 return 1;
936}
937
938static inline int check_object(struct kmem_cache *s, struct slqb_page *page,
939 void *object, int active)
940{
941 return 1;
942}
943
944static inline void add_full(struct kmem_cache_node *n, struct slqb_page *page)
945{
946}
947
948static inline unsigned long kmem_cache_flags(unsigned long objsize,
949 unsigned long flags, const char *name, void (*ctor)(void *))
950{
951 if (num_possible_nodes() > 1)
952 flags |= SLAB_NUMA;
953 return flags;
954}
955
956static const int slqb_debug;
957#endif
958
959/*
960 * allocate a new slab (return its corresponding struct slqb_page)
961 */
962static struct slqb_page *allocate_slab(struct kmem_cache *s,
963 gfp_t flags, int node)
964{
965 struct slqb_page *page;
966 int pages = 1 << s->order;
967
968 flags |= s->allocflags;
969
970 page = (struct slqb_page *)alloc_pages_node(node, flags, s->order);
971 if (!page)
972 return NULL;
973
974 mod_zone_page_state(slqb_page_zone(page),
975 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
976 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
977 pages);
978
979 return page;
980}
981
982/*
983 * Called once for each object on a new slab page
984 */
985static void setup_object(struct kmem_cache *s,
986 struct slqb_page *page, void *object)
987{
988 setup_object_debug(s, page, object);
989 if (unlikely(s->ctor))
990 s->ctor(object);
991}
992
993/*
994 * Allocate a new slab, set up its object list.
995 */
996static struct slqb_page *new_slab_page(struct kmem_cache *s,
997 gfp_t flags, int node, unsigned int colour)
998{
999 struct slqb_page *page;
1000 void *start;
1001 void *last;
1002 void *p;
1003
1004 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1005
1006 page = allocate_slab(s,
1007 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1008 if (!page)
1009 goto out;
1010
1011 page->flags |= PG_SLQB_BIT;
1012
1013 start = page_address(&page->page);
1014
1015 if (unlikely(slab_poison(s)))
1016 memset(start, POISON_INUSE, PAGE_SIZE << s->order);
1017
1018 start += colour;
1019
1020 last = start;
1021 for_each_object(p, s, start) {
1022 setup_object(s, page, p);
1023 set_freepointer(s, last, p);
1024 last = p;
1025 }
1026 set_freepointer(s, last, NULL);
1027
1028 page->freelist = start;
1029 page->inuse = 0;
1030out:
1031 return page;
1032}
1033
1034/*
1035 * Free a slab page back to the page allocator
1036 */
1037static void __free_slab(struct kmem_cache *s, struct slqb_page *page)
1038{
1039 int pages = 1 << s->order;
1040
1041 if (unlikely(slab_debug(s))) {
1042 void *p;
1043
1044 slab_pad_check(s, page);
1045 for_each_free_object(p, s, page->freelist)
1046 check_object(s, page, p, 0);
1047 }
1048
1049 mod_zone_page_state(slqb_page_zone(page),
1050 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1051 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1052 -pages);
1053
1054 __free_slqb_pages(page, s->order, pages);
1055}
1056
1057static void rcu_free_slab(struct rcu_head *h)
1058{
1059 struct slqb_page *page;
1060
1061 page = container_of(h, struct slqb_page, rcu_head);
1062 __free_slab(page->list->cache, page);
1063}
1064
1065static void free_slab(struct kmem_cache *s, struct slqb_page *page)
1066{
1067 VM_BUG_ON(page->inuse);
1068 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU))
1069 call_rcu(&page->rcu_head, rcu_free_slab);
1070 else
1071 __free_slab(s, page);
1072}
1073
1074/*
1075 * Return an object to its slab.
1076 *
1077 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1078 * list_lock in the case of per-node list.
1079 */
1080static int free_object_to_page(struct kmem_cache *s,
1081 struct kmem_cache_list *l, struct slqb_page *page,
1082 void *object)
1083{
1084 VM_BUG_ON(page->list != l);
1085
1086 set_freepointer(s, object, page->freelist);
1087 page->freelist = object;
1088 page->inuse--;
1089
1090 if (!page->inuse) {
1091 if (likely(s->objects > 1)) {
1092 l->nr_partial--;
1093 list_del(&page->lru);
1094 }
1095 l->nr_slabs--;
1096 free_slab(s, page);
1097 slqb_stat_inc(l, FLUSH_SLAB_FREE);
1098 return 1;
1099
1100 } else if (page->inuse + 1 == s->objects) {
1101 l->nr_partial++;
1102 list_add(&page->lru, &l->partial);
1103 slqb_stat_inc(l, FLUSH_SLAB_PARTIAL);
1104 return 0;
1105 }
1106 return 0;
1107}
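/*
 * Illustrative note (not part of the original source): a page moves through
 * three states above. Freeing the last in-use object releases the whole slab
 * back to the page allocator; freeing an object from a previously-full page
 * puts that page back on the partial list; anything in between leaves the
 * page where it already is.
 */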
1108
1109#ifdef CONFIG_SMP
1110static void slab_free_to_remote(struct kmem_cache *s, struct slqb_page *page,
1111 void *object, struct kmem_cache_cpu *c);
1112#endif
1113
1114/*
1115 * Flush the LIFO freelist of objects on a list. Objects are sent back to their
1116 * pages if those pages belong to this list, or to our CPU's remote-free list
1117 * if they do not.
1118 *
1119 * Doesn't flush the entire list. flush_free_list_all does.
1120 *
1121 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1122 * list_lock in the case of per-node list.
1123 */
1124static void flush_free_list(struct kmem_cache *s, struct kmem_cache_list *l)
1125{
1126 void **head;
1127 int nr;
1128 int locked = 0;
1129
1130 nr = l->freelist.nr;
1131 if (unlikely(!nr))
1132 return;
1133
1134 nr = min(slab_freebatch(s), nr);
1135
1136 slqb_stat_inc(l, FLUSH_FREE_LIST);
1137 slqb_stat_add(l, FLUSH_FREE_LIST_OBJECTS, nr);
1138
1139 l->freelist.nr -= nr;
1140 head = l->freelist.head;
1141
1142 do {
1143 struct slqb_page *page;
1144 void **object;
1145
1146 object = head;
1147 VM_BUG_ON(!object);
1148 head = get_freepointer(s, object);
1149 page = virt_to_head_slqb_page(object);
1150
1151#ifdef CONFIG_SMP
1152 if (page->list != l) {
1153 struct kmem_cache_cpu *c;
1154
1155 if (locked) {
1156 spin_unlock(&l->page_lock);
1157 locked = 0;
1158 }
1159
1160 c = get_cpu_slab(s, smp_processor_id());
1161
1162 slab_free_to_remote(s, page, object, c);
1163 slqb_stat_inc(l, FLUSH_FREE_LIST_REMOTE);
1164 } else
1165#endif
1166 {
1167 if (!locked) {
1168 spin_lock(&l->page_lock);
1169 locked = 1;
1170 }
1171 free_object_to_page(s, l, page, object);
1172 }
1173
1174 nr--;
1175 } while (nr);
1176
1177 if (locked)
1178 spin_unlock(&l->page_lock);
1179
1180 l->freelist.head = head;
1181 if (!l->freelist.nr)
1182 l->freelist.tail = NULL;
1183}
1184
1185static void flush_free_list_all(struct kmem_cache *s, struct kmem_cache_list *l)
1186{
1187 while (l->freelist.nr)
1188 flush_free_list(s, l);
1189}
1190
1191#ifdef CONFIG_SMP
1192/*
1193 * If enough objects have been remotely freed back to this list,
1194 * remote_free_check will be set, in which case we'll eventually come here
1195 * to take those objects off our remote_free list and onto our LIFO freelist.
1196 *
1197 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1198 * list_lock in the case of per-node list.
1199 */
1200static void claim_remote_free_list(struct kmem_cache *s,
1201 struct kmem_cache_list *l)
1202{
1203 void **head, **tail;
1204 int nr;
1205
1206 if (!l->remote_free.list.nr)
1207 return;
1208
1209 spin_lock(&l->remote_free.lock);
1210
1211 l->remote_free_check = 0;
1212 head = l->remote_free.list.head;
1213 l->remote_free.list.head = NULL;
1214 tail = l->remote_free.list.tail;
1215 l->remote_free.list.tail = NULL;
1216 nr = l->remote_free.list.nr;
1217 l->remote_free.list.nr = 0;
1218
1219 spin_unlock(&l->remote_free.lock);
1220
1221 VM_BUG_ON(!nr);
1222
1223 if (!l->freelist.nr) {
1224 /* Get head hot for likely subsequent allocation or flush */
1225 prefetchw(head);
1226 l->freelist.head = head;
1227 } else
1228 set_freepointer(s, l->freelist.tail, head);
1229 l->freelist.tail = tail;
1230
1231 l->freelist.nr += nr;
1232
1233 slqb_stat_inc(l, CLAIM_REMOTE_LIST);
1234 slqb_stat_add(l, CLAIM_REMOTE_LIST_OBJECTS, nr);
1235}
1236#else
1237static inline void claim_remote_free_list(struct kmem_cache *s,
1238 struct kmem_cache_list *l)
1239{
1240}
1241#endif
1242
1243/*
1244 * Allocation fastpath. Get an object from the list's LIFO freelist, or
1245 * return NULL if it is empty.
1246 *
1247 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1248 * list_lock in the case of per-node list.
1249 */
1250static __always_inline void *__cache_list_get_object(struct kmem_cache *s,
1251 struct kmem_cache_list *l)
1252{
1253 void *object;
1254
1255 object = l->freelist.head;
1256 if (likely(object)) {
1257 void *next = get_freepointer(s, object);
1258
1259 VM_BUG_ON(!l->freelist.nr);
1260 l->freelist.nr--;
1261 l->freelist.head = next;
1262
1263 return object;
1264 }
1265 VM_BUG_ON(l->freelist.nr);
1266
1267#ifdef CONFIG_SMP
1268 if (unlikely(l->remote_free_check)) {
1269 claim_remote_free_list(s, l);
1270
1271 if (l->freelist.nr > slab_hiwater(s))
1272 flush_free_list(s, l);
1273
1274 /* repetition here helps gcc :( */
1275 object = l->freelist.head;
1276 if (likely(object)) {
1277 void *next = get_freepointer(s, object);
1278
1279 VM_BUG_ON(!l->freelist.nr);
1280 l->freelist.nr--;
1281 l->freelist.head = next;
1282
1283 return object;
1284 }
1285 VM_BUG_ON(l->freelist.nr);
1286 }
1287#endif
1288
1289 return NULL;
1290}
1291
1292/*
1293 * Slow(er) path. Get a page from this list's existing pages. Will be a
1294 * new empty page in the case that __slab_alloc_page has just been called
1295 * (empty pages otherwise never get queued up on the lists), or a partial page
1296 * already on the list.
1297 *
1298 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1299 * list_lock in the case of per-node list.
1300 */
1301static noinline void *__cache_list_get_page(struct kmem_cache *s,
1302 struct kmem_cache_list *l)
1303{
1304 struct slqb_page *page;
1305 void *object;
1306
1307 if (unlikely(!l->nr_partial))
1308 return NULL;
1309
1310 page = list_first_entry(&l->partial, struct slqb_page, lru);
1311 VM_BUG_ON(page->inuse == s->objects);
1312 if (page->inuse + 1 == s->objects) {
1313 l->nr_partial--;
1314 list_del(&page->lru);
1315 }
1316
1317 VM_BUG_ON(!page->freelist);
1318
1319 page->inuse++;
1320
1321 object = page->freelist;
1322 page->freelist = get_freepointer(s, object);
1323 if (page->freelist)
1324 prefetchw(page->freelist);
1325 VM_BUG_ON((page->inuse == s->objects) != (page->freelist == NULL));
1326 slqb_stat_inc(l, ALLOC_SLAB_FILL);
1327
1328 return object;
1329}
1330
1331static void *cache_list_get_page(struct kmem_cache *s,
1332 struct kmem_cache_list *l)
1333{
1334 void *object;
1335
1336 if (unlikely(!l->nr_partial))
1337 return NULL;
1338
1339 spin_lock(&l->page_lock);
1340 object = __cache_list_get_page(s, l);
1341 spin_unlock(&l->page_lock);
1342
1343 return object;
1344}
1345
1346/*
1347 * Allocation slowpath. Allocate a new slab page from the page allocator, and
1348 * put it on the list's partial list. Must be followed by an allocation so
1349 * that we don't have dangling empty pages on the partial list.
1350 *
1351 * Returns 0 on allocation failure.
1352 *
1353 * Must be called with interrupts disabled.
1354 */
1355static noinline void *__slab_alloc_page(struct kmem_cache *s,
1356 gfp_t gfpflags, int node)
1357{
1358 struct slqb_page *page;
1359 struct kmem_cache_list *l;
1360 struct kmem_cache_cpu *c;
1361 unsigned int colour;
1362 void *object;
1363
1364 c = get_cpu_slab(s, smp_processor_id());
1365 colour = c->colour_next;
1366 c->colour_next += s->colour_off;
1367 if (c->colour_next >= s->colour_range)
1368 c->colour_next = 0;
1369
1370 /* Caller handles __GFP_ZERO */
1371 gfpflags &= ~__GFP_ZERO;
1372
1373 if (gfpflags & __GFP_WAIT)
1374 local_irq_enable();
1375 page = new_slab_page(s, gfpflags, node, colour);
1376 if (gfpflags & __GFP_WAIT)
1377 local_irq_disable();
1378 if (unlikely(!page))
1379 return page;
1380
1381 if (!NUMA_BUILD || likely(slqb_page_to_nid(page) == numa_node_id())) {
1382 struct kmem_cache_cpu *c;
1383 int cpu = smp_processor_id();
1384
1385 c = get_cpu_slab(s, cpu);
1386 l = &c->list;
1387 page->list = l;
1388
1389 spin_lock(&l->page_lock);
1390 l->nr_slabs++;
1391 l->nr_partial++;
1392 list_add(&page->lru, &l->partial);
1393 slqb_stat_inc(l, ALLOC);
1394 slqb_stat_inc(l, ALLOC_SLAB_NEW);
1395 object = __cache_list_get_page(s, l);
1396 spin_unlock(&l->page_lock);
1397 } else {
1398#ifdef CONFIG_NUMA
1399 struct kmem_cache_node *n;
1400
1401 n = s->node_slab[slqb_page_to_nid(page)];
1402 l = &n->list;
1403 page->list = l;
1404
1405 spin_lock(&n->list_lock);
1406 spin_lock(&l->page_lock);
1407 l->nr_slabs++;
1408 l->nr_partial++;
1409 list_add(&page->lru, &l->partial);
1410 slqb_stat_inc(l, ALLOC);
1411 slqb_stat_inc(l, ALLOC_SLAB_NEW);
1412 object = __cache_list_get_page(s, l);
1413 spin_unlock(&l->page_lock);
1414 spin_unlock(&n->list_lock);
1415#endif
1416 }
1417 VM_BUG_ON(!object);
1418 return object;
1419}
1420
1421#ifdef CONFIG_NUMA
1422static noinline int alternate_nid(struct kmem_cache *s,
1423 gfp_t gfpflags, int node)
1424{
1425 if (in_interrupt() || (gfpflags & __GFP_THISNODE))
1426 return node;
1427 if (cpuset_do_slab_mem_spread() && (s->flags & SLAB_MEM_SPREAD))
1428 return cpuset_mem_spread_node();
1429 else if (current->mempolicy)
1430 return slab_node(current->mempolicy);
1431 return node;
1432}
1433
1434/*
1435 * Allocate an object from a remote node. Return NULL if none could be found
1436 * (in which case, caller should allocate a new slab)
1437 *
1438 * Must be called with interrupts disabled.
1439 */
1440static void *__remote_slab_alloc_node(struct kmem_cache *s,
1441 gfp_t gfpflags, int node)
1442{
1443 struct kmem_cache_node *n;
1444 struct kmem_cache_list *l;
1445 void *object;
1446
1447 n = s->node_slab[node];
1448 if (unlikely(!n)) /* node has no memory */
1449 return NULL;
1450 l = &n->list;
1451
1452 spin_lock(&n->list_lock);
1453
1454 object = __cache_list_get_object(s, l);
1455 if (unlikely(!object)) {
1456 object = cache_list_get_page(s, l);
1457 if (unlikely(!object)) {
1458 spin_unlock(&n->list_lock);
1459 return __slab_alloc_page(s, gfpflags, node);
1460 }
1461 }
1462 if (likely(object))
1463 slqb_stat_inc(l, ALLOC);
1464 spin_unlock(&n->list_lock);
1465 return object;
1466}
1467
1468static noinline void *__remote_slab_alloc(struct kmem_cache *s,
1469 gfp_t gfpflags, int node)
1470{
1471 void *object;
1472 struct zonelist *zonelist;
1473 struct zoneref *z;
1474 struct zone *zone;
1475 enum zone_type high_zoneidx = gfp_zone(gfpflags);
1476
1477 object = __remote_slab_alloc_node(s, gfpflags, node);
1478 if (likely(object || (gfpflags & __GFP_THISNODE)))
1479 return object;
1480
1481 zonelist = node_zonelist(slab_node(current->mempolicy), gfpflags);
1482 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1483 if (!cpuset_zone_allowed_hardwall(zone, gfpflags))
1484 continue;
1485
1486 node = zone_to_nid(zone);
1487 object = __remote_slab_alloc_node(s, gfpflags, node);
1488 if (likely(object))
1489 return object;
1490 }
1491 return NULL;
1492}
1493#endif
1494
1495/*
1496 * Main allocation path. Return an object, or NULL on allocation failure.
1497 *
1498 * Must be called with interrupts disabled.
1499 */
1500static __always_inline void *__slab_alloc(struct kmem_cache *s,
1501 gfp_t gfpflags, int node)
1502{
1503 void *object;
1504 struct kmem_cache_cpu *c;
1505 struct kmem_cache_list *l;
1506
1507#ifdef CONFIG_NUMA
1508 if (unlikely(node != -1) && unlikely(node != numa_node_id())) {
1509try_remote:
1510 return __remote_slab_alloc(s, gfpflags, node);
1511 }
1512#endif
1513
1514 c = get_cpu_slab(s, smp_processor_id());
1515 VM_BUG_ON(!c);
1516 l = &c->list;
1517 object = __cache_list_get_object(s, l);
1518 if (unlikely(!object)) {
1519#ifdef CONFIG_NUMA
1520 int thisnode = numa_node_id();
1521
1522 /*
1523		 * If the local node is memoryless, try remote allocation before
1524		 * trying the page allocator. Otherwise, what happens is that
1525		 * objects are always freed to remote lists but the allocation
1526		 * side always allocates a new page with only one object
1527		 * used in each page.
1528 */
1529 if (unlikely(!node_state(thisnode, N_HIGH_MEMORY)))
1530 object = __remote_slab_alloc(s, gfpflags, thisnode);
1531#endif
1532
1533 if (!object) {
1534 object = cache_list_get_page(s, l);
1535 if (unlikely(!object)) {
1536 object = __slab_alloc_page(s, gfpflags, node);
1537#ifdef CONFIG_NUMA
1538 if (unlikely(!object)) {
1539 node = numa_node_id();
1540 goto try_remote;
1541 }
1542#endif
1543 return object;
1544 }
1545 }
1546 }
1547 if (likely(object))
1548 slqb_stat_inc(l, ALLOC);
1549 return object;
1550}
1551
1552/*
1553 * Perform some interrupts-on processing around the main allocation path
1554 * (debug checking and memset()ing).
1555 */
1556static __always_inline void *slab_alloc(struct kmem_cache *s,
1557 gfp_t gfpflags, int node, unsigned long addr)
1558{
1559 void *object;
1560 unsigned long flags;
1561
1562 gfpflags &= gfp_allowed_mask;
1563
1564 lockdep_trace_alloc(gfpflags);
1565 might_sleep_if(gfpflags & __GFP_WAIT);
1566
1567 if (should_failslab(s->objsize, gfpflags, s->flags))
1568 return NULL;
1569
1570again:
1571 local_irq_save(flags);
1572 object = __slab_alloc(s, gfpflags, node);
1573 local_irq_restore(flags);
1574
1575 if (unlikely(slab_debug(s)) && likely(object)) {
1576 if (unlikely(!alloc_debug_processing(s, object, addr)))
1577 goto again;
1578 }
1579
1580 if (unlikely(gfpflags & __GFP_ZERO) && likely(object))
1581 memset(object, 0, s->objsize);
1582
1583 return object;
1584}
1585
1586static __always_inline void *__kmem_cache_alloc(struct kmem_cache *s,
1587 gfp_t gfpflags, unsigned long caller)
1588{
1589 int node = -1;
1590
1591#ifdef CONFIG_NUMA
1592 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
1593 node = alternate_nid(s, gfpflags, node);
1594#endif
1595 return slab_alloc(s, gfpflags, node, caller);
1596}
1597
1598void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1599{
1600 return __kmem_cache_alloc(s, gfpflags, _RET_IP_);
1601}
1602EXPORT_SYMBOL(kmem_cache_alloc);
1603
1604#ifdef CONFIG_NUMA
1605void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1606{
1607 return slab_alloc(s, gfpflags, node, _RET_IP_);
1608}
1609EXPORT_SYMBOL(kmem_cache_alloc_node);
1610#endif
1611
1612#ifdef CONFIG_SMP
1613/*
1614 * Flush this CPU's remote free list of objects back to the list from where
1615 * they originate. They end up on that list's remotely freed list, and
1616 * eventually we set its remote_free_check if there are enough objects on it.
1617 *
1618 * This seems convoluted, but it keeps us from stomping on the target CPU's
1619 * fastpath cachelines.
1620 *
1621 * Must be called with interrupts disabled.
1622 */
1623static void flush_remote_free_cache(struct kmem_cache *s,
1624 struct kmem_cache_cpu *c)
1625{
1626 struct kmlist *src;
1627 struct kmem_cache_list *dst;
1628 unsigned int nr;
1629 int set;
1630
1631 src = &c->rlist;
1632 nr = src->nr;
1633 if (unlikely(!nr))
1634 return;
1635
1636#ifdef CONFIG_SLQB_STATS
1637 {
1638 struct kmem_cache_list *l = &c->list;
1639
1640 slqb_stat_inc(l, FLUSH_RFREE_LIST);
1641 slqb_stat_add(l, FLUSH_RFREE_LIST_OBJECTS, nr);
1642 }
1643#endif
1644
1645 dst = c->remote_cache_list;
1646
1647 /*
1648	 * Less common case: dst is filling up, so free synchronously.
1649	 * No point in having the remote CPU free these as it will just
1650 * free them back to the page list anyway.
1651 */
1652 if (unlikely(dst->remote_free.list.nr > (slab_hiwater(s) >> 1))) {
1653 void **head;
1654
1655 head = src->head;
1656 spin_lock(&dst->page_lock);
1657 do {
1658 struct slqb_page *page;
1659 void **object;
1660
1661 object = head;
1662 VM_BUG_ON(!object);
1663 head = get_freepointer(s, object);
1664 page = virt_to_head_slqb_page(object);
1665
1666 free_object_to_page(s, dst, page, object);
1667 nr--;
1668 } while (nr);
1669 spin_unlock(&dst->page_lock);
1670
1671 src->head = NULL;
1672 src->tail = NULL;
1673 src->nr = 0;
1674
1675 return;
1676 }
1677
1678 spin_lock(&dst->remote_free.lock);
1679
1680 if (!dst->remote_free.list.head)
1681 dst->remote_free.list.head = src->head;
1682 else
1683 set_freepointer(s, dst->remote_free.list.tail, src->head);
1684 dst->remote_free.list.tail = src->tail;
1685
1686 src->head = NULL;
1687 src->tail = NULL;
1688 src->nr = 0;
1689
1690 if (dst->remote_free.list.nr < slab_freebatch(s))
1691 set = 1;
1692 else
1693 set = 0;
1694
1695 dst->remote_free.list.nr += nr;
1696
1697 if (unlikely(dst->remote_free.list.nr >= slab_freebatch(s) && set))
1698 dst->remote_free_check = 1;
1699
1700 spin_unlock(&dst->remote_free.lock);
1701}
1702
1703/*
1704 * Free an object to this CPU's remote free list.
1705 *
1706 * Must be called with interrupts disabled.
1707 */
1708static noinline void slab_free_to_remote(struct kmem_cache *s,
1709 struct slqb_page *page, void *object,
1710 struct kmem_cache_cpu *c)
1711{
1712 struct kmlist *r;
1713
1714 /*
1715 * Our remote free list corresponds to a different list. Must
1716 * flush it and switch.
1717 */
1718 if (page->list != c->remote_cache_list) {
1719 flush_remote_free_cache(s, c);
1720 c->remote_cache_list = page->list;
1721 }
1722
1723 r = &c->rlist;
1724 if (!r->head)
1725 r->head = object;
1726 else
1727 set_freepointer(s, r->tail, object);
1728 set_freepointer(s, object, NULL);
1729 r->tail = object;
1730 r->nr++;
1731
1732 if (unlikely(r->nr >= slab_freebatch(s)))
1733 flush_remote_free_cache(s, c);
1734}
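/*
 * Illustrative walk-through (not part of the original source): say an object
 * belonging to CPU 0's list is freed on CPU 1. CPU 1 queues it on its own
 * rlist above; once the rlist reaches slab_freebatch(s) objects, or the next
 * free targets a different list, flush_remote_free_cache() splices the whole
 * batch onto the owning list's remote_free queue and may set
 * remote_free_check. CPU 0 later pulls the objects back onto its LIFO
 * freelist via claim_remote_free_list().
 */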
1735#endif
1736
1737/*
1738 * Main freeing path. Free an object back to its slab list.
1739 *
1740 * Must be called with interrupts disabled.
1741 */
1742static __always_inline void __slab_free(struct kmem_cache *s,
1743 struct slqb_page *page, void *object)
1744{
1745 struct kmem_cache_cpu *c;
1746 struct kmem_cache_list *l;
1747 int thiscpu = smp_processor_id();
1748
1749 c = get_cpu_slab(s, thiscpu);
1750 l = &c->list;
1751
1752 slqb_stat_inc(l, FREE);
1753
1754 if (!NUMA_BUILD || !slab_numa(s) ||
1755 likely(slqb_page_to_nid(page) == numa_node_id())) {
1756 /*
1757 * Freeing fastpath. Collects all local-node objects, not
1758 * just those allocated from our per-CPU list. This allows
1759 * fast transfer of objects from one CPU to another within
1760 * a given node.
1761 */
1762 set_freepointer(s, object, l->freelist.head);
1763 l->freelist.head = object;
1764 if (!l->freelist.nr)
1765 l->freelist.tail = object;
1766 l->freelist.nr++;
1767
1768 if (unlikely(l->freelist.nr > slab_hiwater(s)))
1769 flush_free_list(s, l);
1770
1771 } else {
1772#ifdef CONFIG_SMP
1773 /*
1774 * Freeing an object that was allocated on a remote node.
1775 */
1776 slab_free_to_remote(s, page, object, c);
1777 slqb_stat_inc(l, FREE_REMOTE);
1778#endif
1779 }
1780}
1781
1782/*
1783 * Perform some interrupts-on processing around the main freeing path
1784 * (debug checking).
1785 */
1786static __always_inline void slab_free(struct kmem_cache *s,
1787 struct slqb_page *page, void *object)
1788{
1789 unsigned long flags;
1790
1791 prefetchw(object);
1792
1793 debug_check_no_locks_freed(object, s->objsize);
1794 if (likely(object) && unlikely(slab_debug(s))) {
1795 if (unlikely(!free_debug_processing(s, object, _RET_IP_)))
1796 return;
1797 }
1798
1799 local_irq_save(flags);
1800 __slab_free(s, page, object);
1801 local_irq_restore(flags);
1802}
1803
1804void kmem_cache_free(struct kmem_cache *s, void *object)
1805{
1806 struct slqb_page *page = NULL;
1807
1808 if (slab_numa(s))
1809 page = virt_to_head_slqb_page(object);
1810 slab_free(s, page, object);
1811}
1812EXPORT_SYMBOL(kmem_cache_free);
1813
1814/*
1815 * Calculate the order of allocation given a slab object size.
1816 *
1817 * Order 0 allocations are preferred since order 0 does not cause fragmentation
1818 * in the page allocator, and they have fastpaths in the page allocator. But
1819 * we also want to minimise external fragmentation with large objects.
1820 */
1821static int slab_order(int size, int max_order, int frac)
1822{
1823 int order;
1824
1825 if (fls(size - 1) <= PAGE_SHIFT)
1826 order = 0;
1827 else
1828 order = fls(size - 1) - PAGE_SHIFT;
1829 if (order < slqb_min_order)
1830 order = slqb_min_order;
1831
1832 while (order <= max_order) {
1833 unsigned long slab_size = PAGE_SIZE << order;
1834 unsigned long objects;
1835 unsigned long waste;
1836
1837 objects = slab_size / size;
1838 if (!objects)
1839 goto next;
1840
1841 if (order < MAX_ORDER && objects < slqb_min_objects) {
1842 /*
1843			 * If we don't have enough objects for slqb_min_objects,
1844			 * try the next size up, unless we have reached
1845			 * our maximum possible page size.
1846			 */
1847 goto next;
1848 }
1849
1850 waste = slab_size - (objects * size);
1851
1852 if (waste * frac <= slab_size)
1853 break;
1854
1855next:
1856 order++;
1857 }
1858
1859 return order;
1860}
1861
1862static int calculate_order(int size)
1863{
1864 int order;
1865
1866 /*
1867 * Attempt to find best configuration for a slab. This
1868 * works by first attempting to generate a layout with
1869 * the best configuration and backing off gradually.
1870 */
1871 order = slab_order(size, 1, 4);
1872 if (order <= 1)
1873 return order;
1874
1875 /*
1876 * This size cannot fit in order-1. Allow bigger orders, but
1877 * forget about trying to save space.
1878 */
1879 order = slab_order(size, MAX_ORDER - 1, 0);
1880 if (order < MAX_ORDER)
1881 return order;
1882
1883 return -ENOSYS;
1884}
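/*
 * Worked example (not part of the original source), assuming 4KiB pages,
 * slqb_min_order == 0 and slqb_min_objects small enough not to matter:
 * for a 700-byte object, calculate_order() first tries slab_order(700, 1, 4).
 * An order-0 slab holds 4096 / 700 = 5 objects and wastes 596 bytes;
 * 596 * 4 <= 4096, so order 0 is accepted and no larger order is tried.
 */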
1885
1886/*
1887 * Figure out what the alignment of the objects will be.
1888 */
1889static unsigned long calculate_alignment(unsigned long flags,
1890 unsigned long align, unsigned long size)
1891{
1892 /*
1893 * If the user wants hardware cache aligned objects then follow that
1894 * suggestion if the object is sufficiently large.
1895 *
1896 * The hardware cache alignment cannot override the specified
1897	 * alignment though. If that is greater, then use it.
1898 */
1899 if (flags & SLAB_HWCACHE_ALIGN) {
1900 unsigned long ralign = cache_line_size();
1901
1902 while (size <= ralign / 2)
1903 ralign /= 2;
1904 align = max(align, ralign);
1905 }
1906
1907 if (align < ARCH_SLAB_MINALIGN)
1908 align = ARCH_SLAB_MINALIGN;
1909
1910 return ALIGN(align, sizeof(void *));
1911}
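/*
 * Worked example (not part of the original source): with SLAB_HWCACHE_ALIGN,
 * 64-byte cache lines, a 24-byte object and no explicit alignment, ralign
 * starts at 64 and is halved while the object fits in half of it (24 <= 32,
 * but not 24 <= 16), ending at 32. Assuming ARCH_SLAB_MINALIGN <= 32, the
 * result is ALIGN(max(0, 32), sizeof(void *)) == 32.
 */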
1912
1913static void init_kmem_cache_list(struct kmem_cache *s,
1914 struct kmem_cache_list *l)
1915{
1916 l->cache = s;
1917 l->freelist.nr = 0;
1918 l->freelist.head = NULL;
1919 l->freelist.tail = NULL;
1920 l->nr_partial = 0;
1921 l->nr_slabs = 0;
1922 INIT_LIST_HEAD(&l->partial);
1923 spin_lock_init(&l->page_lock);
1924
1925#ifdef CONFIG_SMP
1926 l->remote_free_check = 0;
1927 spin_lock_init(&l->remote_free.lock);
1928 l->remote_free.list.nr = 0;
1929 l->remote_free.list.head = NULL;
1930 l->remote_free.list.tail = NULL;
1931#endif
1932
1933#ifdef CONFIG_SLQB_STATS
1934 memset(l->stats, 0, sizeof(l->stats));
1935#endif
1936}
1937
1938static void init_kmem_cache_cpu(struct kmem_cache *s,
1939 struct kmem_cache_cpu *c)
1940{
1941 init_kmem_cache_list(s, &c->list);
1942
1943 c->colour_next = 0;
1944#ifdef CONFIG_SMP
1945 c->rlist.nr = 0;
1946 c->rlist.head = NULL;
1947 c->rlist.tail = NULL;
1948 c->remote_cache_list = NULL;
1949#endif
1950}
1951
1952#ifdef CONFIG_NUMA
1953static void init_kmem_cache_node(struct kmem_cache *s,
1954 struct kmem_cache_node *n)
1955{
1956 spin_lock_init(&n->list_lock);
1957 init_kmem_cache_list(s, &n->list);
1958}
1959#endif
1960
1961/* Initial slabs. */
1962#ifdef CONFIG_SMP
1963static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cache_cpus);
1964#endif
1965#ifdef CONFIG_NUMA
1966/* XXX: really need a DEFINE_PER_NODE for per-node data because a static
1967 * array is wasteful */
1968static struct kmem_cache_node kmem_cache_nodes[MAX_NUMNODES];
1969#endif
1970
1971#ifdef CONFIG_SMP
1972static struct kmem_cache kmem_cpu_cache;
1973static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cpu_cpus);
1974#ifdef CONFIG_NUMA
1975static struct kmem_cache_node kmem_cpu_nodes[MAX_NUMNODES]; /* XXX per-nid */
1976#endif
1977#endif
1978
1979#ifdef CONFIG_NUMA
1980static struct kmem_cache kmem_node_cache;
1981#ifdef CONFIG_SMP
1982static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_node_cpus);
1983#endif
1984static struct kmem_cache_node kmem_node_nodes[MAX_NUMNODES]; /*XXX per-nid */
1985#endif
1986
1987#ifdef CONFIG_SMP
1988static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1989 int cpu)
1990{
1991 struct kmem_cache_cpu *c;
1992 int node;
1993
1994 node = cpu_to_node(cpu);
1995
1996 c = kmem_cache_alloc_node(&kmem_cpu_cache, GFP_KERNEL, node);
1997 if (!c)
1998 return NULL;
1999
2000 init_kmem_cache_cpu(s, c);
2001 return c;
2002}
2003
2004static void free_kmem_cache_cpus(struct kmem_cache *s)
2005{
2006 int cpu;
2007
2008 for_each_online_cpu(cpu) {
2009 struct kmem_cache_cpu *c;
2010
2011 c = s->cpu_slab[cpu];
2012 if (c) {
2013 kmem_cache_free(&kmem_cpu_cache, c);
2014 s->cpu_slab[cpu] = NULL;
2015 }
2016 }
2017}
2018
2019static int alloc_kmem_cache_cpus(struct kmem_cache *s)
2020{
2021 int cpu;
2022
2023 for_each_online_cpu(cpu) {
2024 struct kmem_cache_cpu *c;
2025
2026 c = s->cpu_slab[cpu];
2027 if (c)
2028 continue;
2029
2030 c = alloc_kmem_cache_cpu(s, cpu);
2031 if (!c) {
2032 free_kmem_cache_cpus(s);
2033 return 0;
2034 }
2035 s->cpu_slab[cpu] = c;
2036 }
2037 return 1;
2038}
2039
2040#else
2041static inline void free_kmem_cache_cpus(struct kmem_cache *s)
2042{
2043}
2044
2045static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2046{
2047 init_kmem_cache_cpu(s, &s->cpu_slab);
2048 return 1;
2049}
2050#endif
2051
2052#ifdef CONFIG_NUMA
2053static void free_kmem_cache_nodes(struct kmem_cache *s)
2054{
2055 int node;
2056
2057 for_each_node_state(node, N_NORMAL_MEMORY) {
2058 struct kmem_cache_node *n;
2059
2060 n = s->node_slab[node];
2061 if (n) {
2062 kmem_cache_free(&kmem_node_cache, n);
2063 s->node_slab[node] = NULL;
2064 }
2065 }
2066}
2067
2068static int alloc_kmem_cache_nodes(struct kmem_cache *s)
2069{
2070 int node;
2071
2072 for_each_node_state(node, N_NORMAL_MEMORY) {
2073 struct kmem_cache_node *n;
2074
2075 n = kmem_cache_alloc_node(&kmem_node_cache, GFP_KERNEL, node);
2076 if (!n) {
2077 free_kmem_cache_nodes(s);
2078 return 0;
2079 }
2080 init_kmem_cache_node(s, n);
2081 s->node_slab[node] = n;
2082 }
2083 return 1;
2084}
2085#else
2086static void free_kmem_cache_nodes(struct kmem_cache *s)
2087{
2088}
2089
2090static int alloc_kmem_cache_nodes(struct kmem_cache *s)
2091{
2092 return 1;
2093}
2094#endif
2095
2096/*
2097 * calculate_sizes() determines the order and the distribution of data within
2098 * a slab object.
2099 */
2100static int calculate_sizes(struct kmem_cache *s)
2101{
2102 unsigned long flags = s->flags;
2103 unsigned long size = s->objsize;
2104 unsigned long align = s->align;
2105
2106 /*
2107 * Determine if we can poison the object itself. If the user of
2108 * the slab may touch the object after free or before allocation
2109 * then we should never poison the object itself.
2110 */
2111 if (slab_poison(s) && !(flags & SLAB_DESTROY_BY_RCU) && !s->ctor)
2112 s->flags |= __OBJECT_POISON;
2113 else
2114 s->flags &= ~__OBJECT_POISON;
2115
2116 /*
2117 * Round up object size to the next word boundary. We can only
2118 * place the free pointer at word boundaries and this determines
2119 * the possible location of the free pointer.
2120 */
2121 size = ALIGN(size, sizeof(void *));
2122
2123#ifdef CONFIG_SLQB_DEBUG
2124 /*
2125 * If we are Redzoning then check if there is some space between the
2126 * end of the object and the free pointer. If not then add an
2127 * additional word to have some bytes to store Redzone information.
2128 */
2129 if ((flags & SLAB_RED_ZONE) && size == s->objsize)
2130 size += sizeof(void *);
2131#endif
2132
2133 /*
2134 * With that we have determined the number of bytes in actual use
2135 * by the object. This is the potential offset to the free pointer.
2136 */
2137 s->inuse = size;
2138
2139 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || s->ctor)) {
2140 /*
2141 * Relocate free pointer after the object if it is not
2142 * permitted to overwrite the first word of the object on
2143 * kmem_cache_free.
2144 *
2145 * This is the case if we do RCU, have a constructor or
2146 * destructor or are poisoning the objects.
2147 */
2148 s->offset = size;
2149 size += sizeof(void *);
2150 }
2151
2152#ifdef CONFIG_SLQB_DEBUG
2153 if (flags & SLAB_STORE_USER) {
2154 /*
2155 * Need to store information about allocs and frees after
2156 * the object.
2157 */
2158 size += 2 * sizeof(struct track);
2159 }
2160
2161 if (flags & SLAB_RED_ZONE) {
2162 /*
2163 * Add some empty padding so that we can catch
2164 * overwrites from earlier objects rather than let
2165 * tracking information or the free pointer be
2166		 * corrupted if a user writes before the start
2167 * of the object.
2168 */
2169 size += sizeof(void *);
2170 }
2171#endif
2172
2173 /*
2174 * Determine the alignment based on various parameters that the
2175 * user specified and the dynamic determination of cache line size
2176 * on bootup.
2177 */
2178 align = calculate_alignment(flags, align, s->objsize);
2179
2180 /*
2181 * SLQB stores one object immediately after another beginning from
2182 * offset 0. In order to align the objects we have to simply size
2183 * each object to conform to the alignment.
2184 */
2185 size = ALIGN(size, align);
2186 s->size = size;
2187 s->order = calculate_order(size);
2188
2189 if (s->order < 0)
2190 return 0;
2191
2192 s->allocflags = 0;
2193 if (s->order)
2194 s->allocflags |= __GFP_COMP;
2195
2196 if (s->flags & SLAB_CACHE_DMA)
2197 s->allocflags |= SLQB_DMA;
2198
2199 if (s->flags & SLAB_RECLAIM_ACCOUNT)
2200 s->allocflags |= __GFP_RECLAIMABLE;
2201
2202 /*
2203 * Determine the number of objects per slab
2204 */
2205 s->objects = (PAGE_SIZE << s->order) / size;
2206
2207 s->freebatch = max(4UL*PAGE_SIZE / size,
2208 min(256UL, 64*PAGE_SIZE / size));
2209 if (!s->freebatch)
2210 s->freebatch = 1;
2211 s->hiwater = s->freebatch << 2;
2212
2213 return !!s->objects;
2214
2215}
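/*
 * Worked example (not part of the original source), assuming 4KiB pages and
 * a final object size of 256 bytes on an order-0 slab:
 *	objects   = 4096 / 256 = 16
 *	freebatch = max(4 * 4096 / 256, min(256, 64 * 4096 / 256))
 *	          = max(64, min(256, 1024)) = 256
 *	hiwater   = 256 << 2 = 1024 objects
 */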
2216
2217#ifdef CONFIG_SMP
2218/*
2219 * The per-cpu allocator can't be used because it always uses the slab
2220 * allocator, and it can't do per-node allocations.
2221 */
2222static void *kmem_cache_dyn_array_alloc(int ids)
2223{
2224 size_t size = sizeof(void *) * ids;
2225
2226 BUG_ON(!size);
2227
2228 if (unlikely(!slab_is_available())) {
2229 static void *nextmem;
2230 static size_t nextleft;
2231 void *ret;
2232
2233 /*
2234 * Special case for setting up initial caches. These will
2235 * never get freed by definition so we can do it rather
2236 * simply.
2237 */
2238 if (size > nextleft) {
2239 nextmem = alloc_pages_exact(size, GFP_KERNEL);
2240 if (!nextmem)
2241 return NULL;
2242 nextleft = roundup(size, PAGE_SIZE);
2243 }
2244
2245 ret = nextmem;
2246 nextleft -= size;
2247 nextmem += size;
2248 memset(ret, 0, size);
2249 return ret;
2250 } else {
2251 return kzalloc(size, GFP_KERNEL);
2252 }
2253}
2254
2255static void kmem_cache_dyn_array_free(void *array)
2256{
2257 if (unlikely(!slab_is_available()))
2258 return; /* error case without crashing here (will panic soon) */
2259 kfree(array);
2260}
2261#endif
2262
2263/*
2264 * Except in early boot, this should be called with slqb_lock held for write
2265 * to lock out hotplug, and protect list modifications.
2266 */
2267static int kmem_cache_open(struct kmem_cache *s,
2268 const char *name, size_t size, size_t align,
2269 unsigned long flags, void (*ctor)(void *), int alloc)
2270{
2271 unsigned int left_over;
2272
2273 memset(s, 0, sizeof(struct kmem_cache));
2274 s->name = name;
2275 s->ctor = ctor;
2276 s->objsize = size;
2277 s->align = align;
2278 s->flags = kmem_cache_flags(size, flags, name, ctor);
2279
2280 if (!calculate_sizes(s))
2281 goto error;
2282
2283 if (!slab_debug(s)) {
2284 left_over = (PAGE_SIZE << s->order) - (s->objects * s->size);
2285 s->colour_off = max(cache_line_size(), s->align);
2286 s->colour_range = left_over;
2287 } else {
2288 s->colour_off = 0;
2289 s->colour_range = 0;
2290 }
2291
2292#ifdef CONFIG_SMP
2293 s->cpu_slab = kmem_cache_dyn_array_alloc(nr_cpu_ids);
2294 if (!s->cpu_slab)
2295 goto error;
2296# ifdef CONFIG_NUMA
2297 s->node_slab = kmem_cache_dyn_array_alloc(nr_node_ids);
2298 if (!s->node_slab)
2299 goto error_cpu_array;
2300# endif
2301#endif
2302
2303 if (likely(alloc)) {
2304 if (!alloc_kmem_cache_nodes(s))
2305 goto error_node_array;
2306
2307 if (!alloc_kmem_cache_cpus(s))
2308 goto error_nodes;
2309 }
2310
2311 sysfs_slab_add(s);
2312 list_add(&s->list, &slab_caches);
2313
2314 return 1;
2315
2316error_nodes:
2317 free_kmem_cache_nodes(s);
2318error_node_array:
2319#if defined(CONFIG_NUMA) && defined(CONFIG_SMP)
2320 kmem_cache_dyn_array_free(s->node_slab);
2321error_cpu_array:
2322#endif
2323#ifdef CONFIG_SMP
2324 kmem_cache_dyn_array_free(s->cpu_slab);
2325#endif
2326error:
2327 if (flags & SLAB_PANIC)
2328 panic("%s: failed to create slab `%s'\n", __func__, name);
2329 return 0;
2330}
2331
2332/**
2333 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
2334 * @s: the cache we're checking against
2335 * @ptr: pointer to validate
2336 *
2337 * This verifies that the untrusted pointer looks sane;
2338 * it is _not_ a guarantee that the pointer is actually
2339 * part of the slab cache in question, but it at least
2340 * validates that the pointer can be dereferenced and
2341 * looks half-way sane.
2342 *
2343 * Currently only used for dentry validation.
2344 */
2345int kmem_ptr_validate(struct kmem_cache *s, const void *ptr)
2346{
2347 unsigned long addr = (unsigned long)ptr;
2348 struct slqb_page *page;
2349
2350 if (unlikely(addr < PAGE_OFFSET))
2351 goto out;
2352 if (unlikely(addr > (unsigned long)high_memory - s->size))
2353 goto out;
2354 if (unlikely(!IS_ALIGNED(addr, s->align)))
2355 goto out;
2356 if (unlikely(!kern_addr_valid(addr)))
2357 goto out;
2358 if (unlikely(!kern_addr_valid(addr + s->size - 1)))
2359 goto out;
2360 if (unlikely(!pfn_valid(addr >> PAGE_SHIFT)))
2361 goto out;
2362 page = virt_to_head_slqb_page(ptr);
2363 if (unlikely(!(page->flags & PG_SLQB_BIT)))
2364 goto out;
2365 if (unlikely(page->list->cache != s)) /* XXX: ouch, racy */
2366 goto out;
2367 return 1;
2368out:
2369 return 0;
2370}
2371EXPORT_SYMBOL(kmem_ptr_validate);
2372
2373/*
2374 * Determine the size of a slab object
2375 */
2376unsigned int kmem_cache_size(struct kmem_cache *s)
2377{
2378 return s->objsize;
2379}
2380EXPORT_SYMBOL(kmem_cache_size);
2381
2382const char *kmem_cache_name(struct kmem_cache *s)
2383{
2384 return s->name;
2385}
2386EXPORT_SYMBOL(kmem_cache_name);
2387
2388/*
2389 * Release all resources used by a slab cache. No more concurrency on the
2390 * slab, so we can touch remote kmem_cache_cpu structures.
2391 */
2392void kmem_cache_destroy(struct kmem_cache *s)
2393{
2394#ifdef CONFIG_NUMA
2395 int node;
2396#endif
2397 int cpu;
2398
2399 down_write(&slqb_lock);
2400 list_del(&s->list);
2401
2402 local_irq_disable();
2403#ifdef CONFIG_SMP
2404 for_each_online_cpu(cpu) {
2405 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2406 struct kmem_cache_list *l = &c->list;
2407
2408 flush_free_list_all(s, l);
2409 flush_remote_free_cache(s, c);
2410 }
2411#endif
2412
2413 for_each_online_cpu(cpu) {
2414 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2415 struct kmem_cache_list *l = &c->list;
2416
2417 claim_remote_free_list(s, l);
2418 flush_free_list_all(s, l);
2419
2420 WARN_ON(l->freelist.nr);
2421 WARN_ON(l->nr_slabs);
2422 WARN_ON(l->nr_partial);
2423 }
2424
2425 free_kmem_cache_cpus(s);
2426
2427#ifdef CONFIG_NUMA
2428 for_each_node_state(node, N_NORMAL_MEMORY) {
2429 struct kmem_cache_node *n;
2430 struct kmem_cache_list *l;
2431
2432 n = s->node_slab[node];
2433 if (!n)
2434 continue;
2435 l = &n->list;
2436
2437 claim_remote_free_list(s, l);
2438 flush_free_list_all(s, l);
2439
2440 WARN_ON(l->freelist.nr);
2441 WARN_ON(l->nr_slabs);
2442 WARN_ON(l->nr_partial);
2443 }
2444
2445 free_kmem_cache_nodes(s);
2446#endif
2447 local_irq_enable();
2448
2449 sysfs_slab_remove(s);
2450 up_write(&slqb_lock);
2451}
2452EXPORT_SYMBOL(kmem_cache_destroy);
2453
2454/********************************************************************
2455 * Kmalloc subsystem
2456 *******************************************************************/
2457
2458struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_SLQB_HIGH + 1] __cacheline_aligned;
2459EXPORT_SYMBOL(kmalloc_caches);
2460
2461#ifdef CONFIG_ZONE_DMA
2462struct kmem_cache kmalloc_caches_dma[KMALLOC_SHIFT_SLQB_HIGH + 1] __cacheline_aligned;
2463EXPORT_SYMBOL(kmalloc_caches_dma);
2464#endif
2465
2466#ifndef ARCH_KMALLOC_FLAGS
2467#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
2468#endif
2469
2470static struct kmem_cache *open_kmalloc_cache(struct kmem_cache *s,
2471 const char *name, int size, gfp_t gfp_flags)
2472{
2473 unsigned int flags = ARCH_KMALLOC_FLAGS | SLAB_PANIC;
2474
2475 if (gfp_flags & SLQB_DMA)
2476 flags |= SLAB_CACHE_DMA;
2477
2478 kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, flags, NULL, 1);
2479
2480 return s;
2481}
2482
2483/*
2484 * Conversion table for small slabs sizes / 8 to the index in the
2485 * kmalloc array. This is necessary for slabs < 192 since we have non power
2486 * of two cache sizes there. The size of larger slabs can be determined using
2487 * fls.
2488 */
2489static s8 size_index[24] __cacheline_aligned = {
2490 3, /* 8 */
2491 4, /* 16 */
2492 5, /* 24 */
2493 5, /* 32 */
2494 6, /* 40 */
2495 6, /* 48 */
2496 6, /* 56 */
2497 6, /* 64 */
2498#if L1_CACHE_BYTES < 64
2499 1, /* 72 */
2500 1, /* 80 */
2501 1, /* 88 */
2502 1, /* 96 */
2503#else
2504 7,
2505 7,
2506 7,
2507 7,
2508#endif
2509 7, /* 104 */
2510 7, /* 112 */
2511 7, /* 120 */
2512 7, /* 128 */
2513#if L1_CACHE_BYTES < 128
2514 2, /* 136 */
2515 2, /* 144 */
2516 2, /* 152 */
2517 2, /* 160 */
2518 2, /* 168 */
2519 2, /* 176 */
2520 2, /* 184 */
2521 2 /* 192 */
2522#else
2523 -1,
2524 -1,
2525 -1,
2526 -1,
2527 -1,
2528 -1,
2529 -1,
2530 -1
2531#endif
2532};
2533
2534static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2535{
2536 int index;
2537
2538 if (unlikely(size <= KMALLOC_MIN_SIZE)) {
2539 if (unlikely(!size))
2540 return ZERO_SIZE_PTR;
2541
2542 index = KMALLOC_SHIFT_LOW;
2543 goto got_index;
2544 }
2545
2546#if L1_CACHE_BYTES >= 128
2547 if (size <= 128) {
2548#else
2549 if (size <= 192) {
2550#endif
2551 index = size_index[(size - 1) / 8];
2552 } else {
2553 if (unlikely(size > 1UL << KMALLOC_SHIFT_SLQB_HIGH))
2554 return NULL;
2555
2556 index = fls(size - 1);
2557 }
2558
2559got_index:
2560 if (unlikely((flags & SLQB_DMA)))
2561 return &kmalloc_caches_dma[index];
2562 else
2563 return &kmalloc_caches[index];
2564}
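/*
 * Illustrative lookups (not part of the original source), assuming 64-byte
 * cache lines: kmalloc(100) takes the table path, size_index[(100 - 1) / 8]
 * == size_index[12] == 7, i.e. the 128-byte cache; kmalloc(1000) takes the
 * fls() path, fls(999) == 10, i.e. the 1024-byte cache. Requests with
 * SLQB_DMA set are served from kmalloc_caches_dma[] instead.
 */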
2565
2566void *__kmalloc(size_t size, gfp_t flags)
2567{
2568 struct kmem_cache *s;
2569
2570 s = get_slab(size, flags);
2571 if (unlikely(ZERO_OR_NULL_PTR(s)))
2572 return s;
2573
2574 return __kmem_cache_alloc(s, flags, _RET_IP_);
2575}
2576EXPORT_SYMBOL(__kmalloc);
2577
2578#ifdef CONFIG_NUMA
2579void *__kmalloc_node(size_t size, gfp_t flags, int node)
2580{
2581 struct kmem_cache *s;
2582
2583 s = get_slab(size, flags);
2584 if (unlikely(ZERO_OR_NULL_PTR(s)))
2585 return s;
2586
2587 return kmem_cache_alloc_node(s, flags, node);
2588}
2589EXPORT_SYMBOL(__kmalloc_node);
2590#endif
2591
2592size_t ksize(const void *object)
2593{
2594 struct slqb_page *page;
2595 struct kmem_cache *s;
2596
2597 BUG_ON(!object);
2598 if (unlikely(object == ZERO_SIZE_PTR))
2599 return 0;
2600
2601 page = virt_to_head_slqb_page(object);
2602 BUG_ON(!(page->flags & PG_SLQB_BIT));
2603
2604 s = page->list->cache;
2605
2606 /*
2607 * Debugging requires use of the padding between object
2608 * and whatever may come after it.
2609 */
2610 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2611 return s->objsize;
2612
2613 /*
2614 * If we have the need to store the freelist pointer
2615 * back there or track user information then we can
2616 * only use the space before that information.
2617 */
2618 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2619 return s->inuse;
2620
2621 /*
2622 * Else we can use all the padding etc for the allocation
2623 */
2624 return s->size;
2625}
2626EXPORT_SYMBOL(ksize);
2627
2628void kfree(const void *object)
2629{
2630 struct kmem_cache *s;
2631 struct slqb_page *page;
2632
2633 if (unlikely(ZERO_OR_NULL_PTR(object)))
2634 return;
2635
2636 page = virt_to_head_slqb_page(object);
2637 s = page->list->cache;
2638
2639 slab_free(s, page, (void *)object);
2640}
2641EXPORT_SYMBOL(kfree);
2642
2643static void kmem_cache_trim_percpu(void *arg)
2644{
2645 int cpu = smp_processor_id();
2646 struct kmem_cache *s = arg;
2647 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2648 struct kmem_cache_list *l = &c->list;
2649
2650 claim_remote_free_list(s, l);
2651 flush_free_list(s, l);
2652#ifdef CONFIG_SMP
2653 flush_remote_free_cache(s, c);
2654#endif
2655}
2656
2657int kmem_cache_shrink(struct kmem_cache *s)
2658{
2659#ifdef CONFIG_NUMA
2660 int node;
2661#endif
2662
2663 on_each_cpu(kmem_cache_trim_percpu, s, 1);
2664
2665#ifdef CONFIG_NUMA
2666 for_each_node_state(node, N_NORMAL_MEMORY) {
2667 struct kmem_cache_node *n;
2668 struct kmem_cache_list *l;
2669
2670 n = s->node_slab[node];
2671 if (!n)
2672 continue;
2673 l = &n->list;
2674
2675 spin_lock_irq(&n->list_lock);
2676 claim_remote_free_list(s, l);
2677 flush_free_list(s, l);
2678 spin_unlock_irq(&n->list_lock);
2679 }
2680#endif
2681
2682 return 0;
2683}
2684EXPORT_SYMBOL(kmem_cache_shrink);
2685
2686#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2687static void kmem_cache_reap_percpu(void *arg)
2688{
2689 int cpu = smp_processor_id();
2690 struct kmem_cache *s;
2691 long phase = (long)arg;
2692
2693 list_for_each_entry(s, &slab_caches, list) {
2694 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2695 struct kmem_cache_list *l = &c->list;
2696
2697 if (phase == 0) {
2698 flush_free_list_all(s, l);
2699 flush_remote_free_cache(s, c);
2700 }
2701
2702 if (phase == 1) {
2703 claim_remote_free_list(s, l);
2704 flush_free_list_all(s, l);
2705 }
2706 }
2707}
2708
2709static void kmem_cache_reap(void)
2710{
2711 struct kmem_cache *s;
2712 int node;
2713
2714 down_read(&slqb_lock);
2715 on_each_cpu(kmem_cache_reap_percpu, (void *)0, 1);
2716 on_each_cpu(kmem_cache_reap_percpu, (void *)1, 1);
2717
2718 list_for_each_entry(s, &slab_caches, list) {
2719 for_each_node_state(node, N_NORMAL_MEMORY) {
2720 struct kmem_cache_node *n;
2721 struct kmem_cache_list *l;
2722
2723 n = s->node_slab[node];
2724 if (!n)
2725 continue;
2726 l = &n->list;
2727
2728 spin_lock_irq(&n->list_lock);
2729 claim_remote_free_list(s, l);
2730 flush_free_list_all(s, l);
2731 spin_unlock_irq(&n->list_lock);
2732 }
2733 }
2734 up_read(&slqb_lock);
2735}
2736#endif
2737
2738static void cache_trim_worker(struct work_struct *w)
2739{
2740 struct delayed_work *work =
2741 container_of(w, struct delayed_work, work);
2742 struct kmem_cache *s;
2743
2744 if (!down_read_trylock(&slqb_lock))
2745 goto out;
2746
2747 list_for_each_entry(s, &slab_caches, list) {
2748#ifdef CONFIG_NUMA
2749 int node = numa_node_id();
2750 struct kmem_cache_node *n = s->node_slab[node];
2751
2752 if (n) {
2753 struct kmem_cache_list *l = &n->list;
2754
2755 spin_lock_irq(&n->list_lock);
2756 claim_remote_free_list(s, l);
2757 flush_free_list(s, l);
2758 spin_unlock_irq(&n->list_lock);
2759 }
2760#endif
2761
2762 local_irq_disable();
2763 kmem_cache_trim_percpu(s);
2764 local_irq_enable();
2765 }
2766
2767 up_read(&slqb_lock);
2768out:
2769 schedule_delayed_work(work, round_jiffies_relative(3*HZ));
2770}
2771
2772static DEFINE_PER_CPU(struct delayed_work, slqb_cache_trim_work);
2773
2774static void __cpuinit start_cpu_timer(int cpu)
2775{
2776 struct delayed_work *cache_trim_work = &per_cpu(slqb_cache_trim_work,
2777 cpu);
2778
2779 /*
2780 * When this gets called from do_initcalls via cpucache_init(),
2781 * init_workqueues() has already run, so keventd will be setup
2782 * at that time.
2783 */
2784 if (keventd_up() && cache_trim_work->work.func == NULL) {
2785 INIT_DELAYED_WORK(cache_trim_work, cache_trim_worker);
2786 schedule_delayed_work_on(cpu, cache_trim_work,
2787 __round_jiffies_relative(HZ, cpu));
2788 }
2789}
2790
2791static int __init cpucache_init(void)
2792{
2793 int cpu;
2794
2795 for_each_online_cpu(cpu)
2796 start_cpu_timer(cpu);
2797
2798 return 0;
2799}
2800device_initcall(cpucache_init);
2801
2802#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2803static void slab_mem_going_offline_callback(void *arg)
2804{
2805 kmem_cache_reap();
2806}
2807
2808static void slab_mem_offline_callback(void *arg)
2809{
2810 /* XXX: should release structures, see CPU offline comment */
2811}
2812
2813static int slab_mem_going_online_callback(void *arg)
2814{
2815 struct kmem_cache *s;
2816 struct kmem_cache_node *n;
2817 struct memory_notify *marg = arg;
2818 int nid = marg->status_change_nid;
2819 int ret = 0;
2820
2821 /*
2822 * If the node's memory is already available, then kmem_cache_node is
2823 * already created. Nothing to do.
2824 */
2825 if (nid < 0)
2826 return 0;
2827
2828 /*
2829	 * We are bringing a node online. No memory is available yet. We must
2830 * allocate a kmem_cache_node structure in order to bring the node
2831 * online.
2832 */
2833 down_write(&slqb_lock);
2834 list_for_each_entry(s, &slab_caches, list) {
2835 /*
2836 * XXX: kmem_cache_alloc_node will fallback to other nodes
2837 * since memory is not yet available from the node that
2838 * is brought up.
2839 */
2840		if (s->node_slab[nid]) /* could be leftover from last online */
2841 continue;
2842 n = kmem_cache_alloc(&kmem_node_cache, GFP_KERNEL);
2843 if (!n) {
2844 ret = -ENOMEM;
2845 goto out;
2846 }
2847 init_kmem_cache_node(s, n);
2848 s->node_slab[nid] = n;
2849 }
2850out:
2851 up_write(&slqb_lock);
2852 return ret;
2853}
2854
2855static int slab_memory_callback(struct notifier_block *self,
2856 unsigned long action, void *arg)
2857{
2858 int ret = 0;
2859
2860 switch (action) {
2861 case MEM_GOING_ONLINE:
2862 ret = slab_mem_going_online_callback(arg);
2863 break;
2864 case MEM_GOING_OFFLINE:
2865 slab_mem_going_offline_callback(arg);
2866 break;
2867 case MEM_OFFLINE:
2868 case MEM_CANCEL_ONLINE:
2869 slab_mem_offline_callback(arg);
2870 break;
2871 case MEM_ONLINE:
2872 case MEM_CANCEL_OFFLINE:
2873 break;
2874 }
2875
2876 if (ret)
2877 ret = notifier_from_errno(ret);
2878 else
2879 ret = NOTIFY_OK;
2880 return ret;
2881}
2882
2883#endif /* CONFIG_MEMORY_HOTPLUG */
2884
2885/********************************************************************
2886 * Basic setup of slabs
2887 *******************************************************************/
2888
2889void __init kmem_cache_init(void)
2890{
2891 int i;
2892 unsigned int flags = SLAB_HWCACHE_ALIGN|SLAB_PANIC;
2893
2894 /*
2895 * All the ifdefs are rather ugly here, but it's just the setup code,
2896 * so it doesn't have to be too readable :)
2897 */
2898
2899 /*
2900 * No need to take slqb_lock here: there should be no concurrency
2901 * anyway, and spin_unlock_irq in rwsem code could enable interrupts
2902 * too early.
2903 */
2904 kmem_cache_open(&kmem_cache_cache, "kmem_cache",
2905 sizeof(struct kmem_cache), 0, flags, NULL, 0);
2906#ifdef CONFIG_SMP
2907 kmem_cache_open(&kmem_cpu_cache, "kmem_cache_cpu",
2908 sizeof(struct kmem_cache_cpu), 0, flags, NULL, 0);
2909#endif
2910#ifdef CONFIG_NUMA
2911 kmem_cache_open(&kmem_node_cache, "kmem_cache_node",
2912 sizeof(struct kmem_cache_node), 0, flags, NULL, 0);
2913#endif
2914
2915#ifdef CONFIG_SMP
2916 for_each_possible_cpu(i) {
2917 struct kmem_cache_cpu *c;
2918
2919 c = &per_cpu(kmem_cache_cpus, i);
2920 init_kmem_cache_cpu(&kmem_cache_cache, c);
2921 kmem_cache_cache.cpu_slab[i] = c;
2922
2923 c = &per_cpu(kmem_cpu_cpus, i);
2924 init_kmem_cache_cpu(&kmem_cpu_cache, c);
2925 kmem_cpu_cache.cpu_slab[i] = c;
2926
2927#ifdef CONFIG_NUMA
2928 c = &per_cpu(kmem_node_cpus, i);
2929 init_kmem_cache_cpu(&kmem_node_cache, c);
2930 kmem_node_cache.cpu_slab[i] = c;
2931#endif
2932 }
2933#else
2934 init_kmem_cache_cpu(&kmem_cache_cache, &kmem_cache_cache.cpu_slab);
2935#endif
2936
2937#ifdef CONFIG_NUMA
2938 for_each_node_state(i, N_NORMAL_MEMORY) {
2939 struct kmem_cache_node *n;
2940
2941 n = &kmem_cache_nodes[i];
2942 init_kmem_cache_node(&kmem_cache_cache, n);
2943 kmem_cache_cache.node_slab[i] = n;
2944#ifdef CONFIG_SMP
2945 n = &kmem_cpu_nodes[i];
2946 init_kmem_cache_node(&kmem_cpu_cache, n);
2947 kmem_cpu_cache.node_slab[i] = n;
2948#endif
2949 n = &kmem_node_nodes[i];
2950 init_kmem_cache_node(&kmem_node_cache, n);
2951 kmem_node_cache.node_slab[i] = n;
2952 }
2953#endif
2954
2955	/* Caches that are not of power-of-two size */
2956 if (L1_CACHE_BYTES < 64 && KMALLOC_MIN_SIZE <= 64) {
2957 open_kmalloc_cache(&kmalloc_caches[1],
2958 "kmalloc-96", 96, GFP_KERNEL);
2959#ifdef CONFIG_ZONE_DMA
2960 open_kmalloc_cache(&kmalloc_caches_dma[1],
2961 "kmalloc_dma-96", 96, GFP_KERNEL|SLQB_DMA);
2962#endif
2963 }
2964 if (L1_CACHE_BYTES < 128 && KMALLOC_MIN_SIZE <= 128) {
2965 open_kmalloc_cache(&kmalloc_caches[2],
2966 "kmalloc-192", 192, GFP_KERNEL);
2967#ifdef CONFIG_ZONE_DMA
2968 open_kmalloc_cache(&kmalloc_caches_dma[2],
2969 "kmalloc_dma-192", 192, GFP_KERNEL|SLQB_DMA);
2970#endif
2971 }
2972
2973 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_SLQB_HIGH; i++) {
2974 open_kmalloc_cache(&kmalloc_caches[i],
2975 "kmalloc", 1 << i, GFP_KERNEL);
2976#ifdef CONFIG_ZONE_DMA
2977 open_kmalloc_cache(&kmalloc_caches_dma[i],
2978 "kmalloc_dma", 1 << i, GFP_KERNEL|SLQB_DMA);
2979#endif
2980 }
2981
2982 /*
2983 * Patch up the size_index table if we have strange large alignment
2984 * requirements for the kmalloc array. This is only the case for
2985 * mips it seems. The standard arches will not generate any code here.
2986 *
2987 * Largest permitted alignment is 256 bytes due to the way we
2988 * handle the index determination for the smaller caches.
2989 *
2990 * Make sure that nothing crazy happens if someone starts tinkering
2991 * around with ARCH_KMALLOC_MINALIGN
2992 */
2993 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
2994 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
2995
2996 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
2997 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
2998
2999 /* Provide the correct kmalloc names now that the caches are up */
3000 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_SLQB_HIGH; i++) {
3001 kmalloc_caches[i].name =
3002 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
3003#ifdef CONFIG_ZONE_DMA
3004 kmalloc_caches_dma[i].name =
3005 kasprintf(GFP_KERNEL, "kmalloc_dma-%d", 1 << i);
3006#endif
3007 }
3008
3009#ifdef CONFIG_SMP
3010 register_cpu_notifier(&slab_notifier);
3011#endif
3012#ifdef CONFIG_NUMA
3013 hotplug_memory_notifier(slab_memory_callback, 1);
3014#endif
3015 /*
3016 * smp_init() has not yet been called, so no worries about memory
3017 * ordering with __slab_is_available.
3018 */
3019 __slab_is_available = 1;
3020}
3021
3022void __init kmem_cache_init_late(void)
3023{
3024}
3025
3026/*
3027 * Some basic slab creation sanity checks
3028 */
3029static int kmem_cache_create_ok(const char *name, size_t size,
3030 size_t align, unsigned long flags)
3031{
3032 struct kmem_cache *tmp;
3033
3034 /*
3035 * Sanity checks... these are all serious usage bugs.
3036 */
3037 if (!name || in_interrupt() || (size < sizeof(void *))) {
3038 printk(KERN_ERR "kmem_cache_create(): early error in slab %s\n",
3039 name);
3040 dump_stack();
3041
3042 return 0;
3043 }
3044
3045 list_for_each_entry(tmp, &slab_caches, list) {
3046 char x;
3047 int res;
3048
3049 /*
3050 * This happens when the module gets unloaded and doesn't
3051 * destroy its slab cache and no-one else reuses the vmalloc
3052 * area of the module. Print a warning.
3053 */
3054 res = probe_kernel_address(tmp->name, x);
3055 if (res) {
3056 printk(KERN_ERR
3057 "SLAB: cache with size %d has lost its name\n",
3058 tmp->size);
3059 continue;
3060 }
3061
3062 if (!strcmp(tmp->name, name)) {
3063 printk(KERN_ERR
3064 "SLAB: duplicate cache %s\n", name);
3065 dump_stack();
3066
3067 return 0;
3068 }
3069 }
3070
3071 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
3072 if (flags & SLAB_DESTROY_BY_RCU)
3073 WARN_ON(flags & SLAB_POISON);
3074
3075 return 1;
3076}
3077
3078struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3079 size_t align, unsigned long flags, void (*ctor)(void *))
3080{
3081 struct kmem_cache *s;
3082
3083 down_write(&slqb_lock);
3084 if (!kmem_cache_create_ok(name, size, align, flags))
3085 goto err;
3086
3087 s = kmem_cache_alloc(&kmem_cache_cache, GFP_KERNEL);
3088 if (!s)
3089 goto err;
3090
3091 if (kmem_cache_open(s, name, size, align, flags, ctor, 1)) {
3092 up_write(&slqb_lock);
3093 return s;
3094 }
3095
3096 kmem_cache_free(&kmem_cache_cache, s);
3097
3098err:
3099 up_write(&slqb_lock);
3100 if (flags & SLAB_PANIC)
3101 panic("%s: failed to create slab `%s'\n", __func__, name);
3102
3103 return NULL;
3104}
3105EXPORT_SYMBOL(kmem_cache_create);
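For illustration, a minimal sketch of how a module that includes <linux/slab.h> would use the interface exported above; the cache name, struct foo, and the SLAB_HWCACHE_ALIGN flag are illustrative choices, not taken from this file.

    /* Illustrative caller of the kmem_cache API exported above. */
    struct foo {
    	int id;
    	struct list_head list;
    };

    static struct kmem_cache *foo_cachep;

    static int __init foo_cache_init(void)
    {
    	foo_cachep = kmem_cache_create("foo", sizeof(struct foo),
    				       0, SLAB_HWCACHE_ALIGN, NULL);
    	return foo_cachep ? 0 : -ENOMEM;
    }

    static void foo_cache_use(void)
    {
    	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);

    	if (f)
    		kmem_cache_free(foo_cachep, f);
    }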
3106
3107#ifdef CONFIG_SMP
3108/*
3109 * Use the cpu notifier to ensure that the cpu slabs are flushed when
3110 * necessary.
3111 */
3112static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3113 unsigned long action, void *hcpu)
3114{
3115 long cpu = (long)hcpu;
3116 struct kmem_cache *s;
3117
3118 switch (action) {
3119 case CPU_UP_PREPARE:
3120 case CPU_UP_PREPARE_FROZEN:
3121 down_write(&slqb_lock);
3122 list_for_each_entry(s, &slab_caches, list) {
3123 if (s->cpu_slab[cpu]) /* could be left over from last online */
3124 continue;
3125 s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu);
3126 if (!s->cpu_slab[cpu]) {
3127 up_write(&slqb_lock);
3128 return NOTIFY_BAD;
3129 }
3130 }
3131 up_write(&slqb_lock);
3132 break;
3133
3134 case CPU_ONLINE:
3135 case CPU_ONLINE_FROZEN:
3136 case CPU_DOWN_FAILED:
3137 case CPU_DOWN_FAILED_FROZEN:
3138 start_cpu_timer(cpu);
3139 break;
3140
3141 case CPU_DOWN_PREPARE:
3142 case CPU_DOWN_PREPARE_FROZEN:
3143 cancel_delayed_work_sync(&per_cpu(slqb_cache_trim_work,
3144 cpu));
3145 per_cpu(slqb_cache_trim_work, cpu).work.func = NULL;
3146 break;
3147
3148 case CPU_UP_CANCELED:
3149 case CPU_UP_CANCELED_FROZEN:
3150 case CPU_DEAD:
3151 case CPU_DEAD_FROZEN:
3152 /*
3153 * XXX: Freeing here doesn't work because objects can still be
3154 * on this CPU's list. The periodic timer needs to check if a CPU
3155 * is offline and then try to clean up from there. Same for node
3156 * offline.
3157 */
3158 default:
3159 break;
3160 }
3161 return NOTIFY_OK;
3162}
3163
3164static struct notifier_block __cpuinitdata slab_notifier = {
3165 .notifier_call = slab_cpuup_callback
3166};
3167
3168#endif
3169
3170#ifdef CONFIG_SLQB_DEBUG
3171void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3172{
3173 struct kmem_cache *s;
3174 int node = -1;
3175
3176 s = get_slab(size, flags);
3177 if (unlikely(ZERO_OR_NULL_PTR(s)))
3178 return s;
3179
3180#ifdef CONFIG_NUMA
3181 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
3182 node = alternate_nid(s, flags, node);
3183#endif
3184 return slab_alloc(s, flags, node, caller);
3185}
3186
3187void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
3188 unsigned long caller)
3189{
3190 struct kmem_cache *s;
3191
3192 s = get_slab(size, flags);
3193 if (unlikely(ZERO_OR_NULL_PTR(s)))
3194 return s;
3195
3196 return slab_alloc(s, flags, node, caller);
3197}
3198#endif
3199
3200#if defined(CONFIG_SLQB_SYSFS) || defined(CONFIG_SLABINFO)
3201struct stats_gather {
3202 struct kmem_cache *s;
3203 spinlock_t lock;
3204 unsigned long nr_slabs;
3205 unsigned long nr_partial;
3206 unsigned long nr_inuse;
3207 unsigned long nr_objects;
3208
3209#ifdef CONFIG_SLQB_STATS
3210 unsigned long stats[NR_SLQB_STAT_ITEMS];
3211#endif
3212};
3213
3214static void __gather_stats(void *arg)
3215{
3216 unsigned long nr_slabs;
3217 unsigned long nr_partial;
3218 unsigned long nr_inuse;
3219 struct stats_gather *gather = arg;
3220 int cpu = smp_processor_id();
3221 struct kmem_cache *s = gather->s;
3222 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3223 struct kmem_cache_list *l = &c->list;
3224 struct slqb_page *page;
3225#ifdef CONFIG_SLQB_STATS
3226 int i;
3227#endif
3228
3229 spin_lock(&l->page_lock);
3230 nr_slabs = l->nr_slabs;
3231 nr_partial = l->nr_partial;
3232 nr_inuse = (nr_slabs - nr_partial) * s->objects;
3233
3234 list_for_each_entry(page, &l->partial, lru) {
3235 nr_inuse += page->inuse;
3236 }
3237 spin_unlock(&l->page_lock);
3238
3239 spin_lock(&gather->lock);
3240 gather->nr_slabs += nr_slabs;
3241 gather->nr_partial += nr_partial;
3242 gather->nr_inuse += nr_inuse;
3243#ifdef CONFIG_SLQB_STATS
3244 for (i = 0; i < NR_SLQB_STAT_ITEMS; i++)
3245 gather->stats[i] += l->stats[i];
3246#endif
3247 spin_unlock(&gather->lock);
3248}
3249
3250/* must be called with slqb_lock held */
3251static void gather_stats_locked(struct kmem_cache *s,
3252 struct stats_gather *stats)
3253{
3254#ifdef CONFIG_NUMA
3255 int node;
3256#endif
3257
3258 memset(stats, 0, sizeof(struct stats_gather));
3259 stats->s = s;
3260 spin_lock_init(&stats->lock);
3261
3262 on_each_cpu(__gather_stats, stats, 1);
3263
3264#ifdef CONFIG_NUMA
3265 for_each_online_node(node) {
3266 struct kmem_cache_node *n = s->node_slab[node];
3267 struct kmem_cache_list *l = &n->list;
3268 struct slqb_page *page;
3269 unsigned long flags;
3270#ifdef CONFIG_SLQB_STATS
3271 int i;
3272#endif
3273
3274 spin_lock_irqsave(&n->list_lock, flags);
3275#ifdef CONFIG_SLQB_STATS
3276 for (i = 0; i < NR_SLQB_STAT_ITEMS; i++)
3277 stats->stats[i] += l->stats[i];
3278#endif
3279 stats->nr_slabs += l->nr_slabs;
3280 stats->nr_partial += l->nr_partial;
3281 stats->nr_inuse += (l->nr_slabs - l->nr_partial) * s->objects;
3282
3283 list_for_each_entry(page, &l->partial, lru) {
3284 stats->nr_inuse += page->inuse;
3285 }
3286 spin_unlock_irqrestore(&n->list_lock, flags);
3287 }
3288#endif
3289
3290 stats->nr_objects = stats->nr_slabs * s->objects;
3291}
3292
3293#ifdef CONFIG_SLQB_SYSFS
3294static void gather_stats(struct kmem_cache *s, struct stats_gather *stats)
3295{
3296 down_read(&slqb_lock); /* hold off hotplug */
3297 gather_stats_locked(s, stats);
3298 up_read(&slqb_lock);
3299}
3300#endif
3301#endif
3302
3303/*
3304 * The /proc/slabinfo ABI
3305 */
3306#ifdef CONFIG_SLABINFO
3307#include <linux/proc_fs.h>
3308ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3309 size_t count, loff_t *ppos)
3310{
3311 return -EINVAL;
3312}
3313
3314static void print_slabinfo_header(struct seq_file *m)
3315{
3316 seq_puts(m, "slabinfo - version: 2.1\n");
3317 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
3318 "<objperslab> <pagesperslab>");
3319 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
3320 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
3321 seq_putc(m, '\n');
3322}
3323
3324static void *s_start(struct seq_file *m, loff_t *pos)
3325{
3326 loff_t n = *pos;
3327
3328 down_read(&slqb_lock);
3329 if (!n)
3330 print_slabinfo_header(m);
3331
3332 return seq_list_start(&slab_caches, *pos);
3333}
3334
3335static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3336{
3337 return seq_list_next(p, &slab_caches, pos);
3338}
3339
3340static void s_stop(struct seq_file *m, void *p)
3341{
3342 up_read(&slqb_lock);
3343}
3344
3345static int s_show(struct seq_file *m, void *p)
3346{
3347 struct stats_gather stats;
3348 struct kmem_cache *s;
3349
3350 s = list_entry(p, struct kmem_cache, list);
3351
3352 gather_stats_locked(s, &stats);
3353
3354 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, stats.nr_inuse,
3355 stats.nr_objects, s->size, s->objects, (1 << s->order));
3356 seq_printf(m, " : tunables %4u %4u %4u", slab_hiwater(s),
3357 slab_freebatch(s), 0);
3358 seq_printf(m, " : slabdata %6lu %6lu %6lu", stats.nr_slabs,
3359 stats.nr_slabs, 0UL);
3360 seq_putc(m, '\n');
3361 return 0;
3362}
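For reference, a data line emitted by s_show() above, under the 2.1 header from print_slabinfo_header(), would look roughly like the following; the numbers are illustrative, chosen so that 32 order-0 slabs each hold 64 objects of 64 bytes:

    kmalloc-64          1536   2048     64   64    1 : tunables 1024   64    0 : slabdata     32     32      0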
3363
3364static const struct seq_operations slabinfo_op = {
3365 .start = s_start,
3366 .next = s_next,
3367 .stop = s_stop,
3368 .show = s_show,
3369};
3370
3371static int slabinfo_open(struct inode *inode, struct file *file)
3372{
3373 return seq_open(file, &slabinfo_op);
3374}
3375
3376static const struct file_operations proc_slabinfo_operations = {
3377 .open = slabinfo_open,
3378 .read = seq_read,
3379 .llseek = seq_lseek,
3380 .release = seq_release,
3381};
3382
3383static int __init slab_proc_init(void)
3384{
3385 proc_create("slabinfo", S_IWUSR|S_IRUGO, NULL,
3386 &proc_slabinfo_operations);
3387 return 0;
3388}
3389module_init(slab_proc_init);
3390#endif /* CONFIG_SLABINFO */
3391
3392#ifdef CONFIG_SLQB_SYSFS
3393/*
3394 * sysfs API
3395 */
3396#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3397#define to_slab(n) container_of(n, struct kmem_cache, kobj)
3398
3399struct slab_attribute {
3400 struct attribute attr;
3401 ssize_t (*show)(struct kmem_cache *s, char *buf);
3402 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
3403};
3404
3405#define SLAB_ATTR_RO(_name) \
3406 static struct slab_attribute _name##_attr = __ATTR_RO(_name)
3407
3408#define SLAB_ATTR(_name) \
3409 static struct slab_attribute _name##_attr = \
3410 __ATTR(_name, 0644, _name##_show, _name##_store)
3411
3412static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
3413{
3414 return sprintf(buf, "%d\n", s->size);
3415}
3416SLAB_ATTR_RO(slab_size);
3417
3418static ssize_t align_show(struct kmem_cache *s, char *buf)
3419{
3420 return sprintf(buf, "%d\n", s->align);
3421}
3422SLAB_ATTR_RO(align);
3423
3424static ssize_t object_size_show(struct kmem_cache *s, char *buf)
3425{
3426 return sprintf(buf, "%d\n", s->objsize);
3427}
3428SLAB_ATTR_RO(object_size);
3429
3430static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
3431{
3432 return sprintf(buf, "%d\n", s->objects);
3433}
3434SLAB_ATTR_RO(objs_per_slab);
3435
3436static ssize_t order_show(struct kmem_cache *s, char *buf)
3437{
3438 return sprintf(buf, "%d\n", s->order);
3439}
3440SLAB_ATTR_RO(order);
3441
3442static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3443{
3444 if (s->ctor) {
3445 int n = sprint_symbol(buf, (unsigned long)s->ctor);
3446
3447 return n + sprintf(buf + n, "\n");
3448 }
3449 return 0;
3450}
3451SLAB_ATTR_RO(ctor);
3452
3453static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3454{
3455 struct stats_gather stats;
3456
3457 gather_stats(s, &stats);
3458
3459 return sprintf(buf, "%lu\n", stats.nr_slabs);
3460}
3461SLAB_ATTR_RO(slabs);
3462
3463static ssize_t objects_show(struct kmem_cache *s, char *buf)
3464{
3465 struct stats_gather stats;
3466
3467 gather_stats(s, &stats);
3468
3469 return sprintf(buf, "%lu\n", stats.nr_inuse);
3470}
3471SLAB_ATTR_RO(objects);
3472
3473static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
3474{
3475 struct stats_gather stats;
3476
3477 gather_stats(s, &stats);
3478
3479 return sprintf(buf, "%lu\n", stats.nr_objects);
3480}
3481SLAB_ATTR_RO(total_objects);
3482
3483#ifdef CONFIG_FAILSLAB
3484static ssize_t failslab_show(struct kmem_cache *s, char *buf)
3485{
3486 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
3487}
3488
3489static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
3490 size_t length)
3491{
3492 s->flags &= ~SLAB_FAILSLAB;
3493 if (buf[0] == '1')
3494 s->flags |= SLAB_FAILSLAB;
3495 return length;
3496}
3497SLAB_ATTR(failslab);
3498#endif
3499
3500static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3501{
3502 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3503}
3504SLAB_ATTR_RO(reclaim_account);
3505
3506static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
3507{
3508 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
3509}
3510SLAB_ATTR_RO(hwcache_align);
3511
3512#ifdef CONFIG_ZONE_DMA
3513static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
3514{
3515 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
3516}
3517SLAB_ATTR_RO(cache_dma);
3518#endif
3519
3520static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
3521{
3522 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
3523}
3524SLAB_ATTR_RO(destroy_by_rcu);
3525
3526static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
3527{
3528 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
3529}
3530SLAB_ATTR_RO(red_zone);
3531
3532static ssize_t poison_show(struct kmem_cache *s, char *buf)
3533{
3534 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
3535}
3536SLAB_ATTR_RO(poison);
3537
3538static ssize_t store_user_show(struct kmem_cache *s, char *buf)
3539{
3540 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
3541}
3542SLAB_ATTR_RO(store_user);
3543
3544static ssize_t hiwater_store(struct kmem_cache *s,
3545 const char *buf, size_t length)
3546{
3547 long hiwater;
3548 int err;
3549
3550 err = strict_strtol(buf, 10, &hiwater);
3551 if (err)
3552 return err;
3553
3554 if (hiwater < 0)
3555 return -EINVAL;
3556
3557 s->hiwater = hiwater;
3558
3559 return length;
3560}
3561
3562static ssize_t hiwater_show(struct kmem_cache *s, char *buf)
3563{
3564 return sprintf(buf, "%d\n", slab_hiwater(s));
3565}
3566SLAB_ATTR(hiwater);
3567
3568static ssize_t freebatch_store(struct kmem_cache *s,
3569 const char *buf, size_t length)
3570{
3571 long freebatch;
3572 int err;
3573
3574 err = strict_strtol(buf, 10, &freebatch);
3575 if (err)
3576 return err;
3577
3578 if (freebatch <= 0 || freebatch - 1 > s->hiwater)
3579 return -EINVAL;
3580
3581 s->freebatch = freebatch;
3582
3583 return length;
3584}
3585
3586static ssize_t freebatch_show(struct kmem_cache *s, char *buf)
3587{
3588 return sprintf(buf, "%d\n", slab_freebatch(s));
3589}
3590SLAB_ATTR(freebatch);
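Because slab_sysfs_init() below registers the "slab" kset under kernel_kobj, the hiwater and freebatch attributes defined above surface as /sys/kernel/slab/<cache>/hiwater and .../freebatch. A user-space sketch of writing one of them follows; the cache name and the value are illustrative, and the write lands in hiwater_store() above, which rejects negative values via strict_strtol().

    /* User-space sketch: raise the free-list high watermark of one cache. */
    #include <fcntl.h>
    #include <unistd.h>

    static int set_slqb_hiwater(const char *val, size_t len)
    {
    	int fd = open("/sys/kernel/slab/kmalloc-64/hiwater", O_WRONLY);
    	int ret = -1;

    	if (fd < 0)
    		return -1;
    	if (write(fd, val, len) == (ssize_t)len)
    		ret = 0;
    	close(fd);
    	return ret;
    }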
3591
3592#ifdef CONFIG_SLQB_STATS
3593static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
3594{
3595 struct stats_gather stats;
3596 int len;
3597#ifdef CONFIG_SMP
3598 int cpu;
3599#endif
3600
3601 gather_stats(s, &stats);
3602
3603 len = sprintf(buf, "%lu", stats.stats[si]);
3604
3605#ifdef CONFIG_SMP
3606 for_each_online_cpu(cpu) {
3607 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3608 struct kmem_cache_list *l = &c->list;
3609
3610 if (len < PAGE_SIZE - 20)
3611 len += sprintf(buf+len, " C%d=%lu", cpu, l->stats[si]);
3612 }
3613#endif
3614 return len + sprintf(buf + len, "\n");
3615}
3616
3617#define STAT_ATTR(si, text) \
3618static ssize_t text##_show(struct kmem_cache *s, char *buf) \
3619{ \
3620 return show_stat(s, buf, si); \
3621} \
3622SLAB_ATTR_RO(text); \
3623
3624STAT_ATTR(ALLOC, alloc);
3625STAT_ATTR(ALLOC_SLAB_FILL, alloc_slab_fill);
3626STAT_ATTR(ALLOC_SLAB_NEW, alloc_slab_new);
3627STAT_ATTR(FREE, free);
3628STAT_ATTR(FREE_REMOTE, free_remote);
3629STAT_ATTR(FLUSH_FREE_LIST, flush_free_list);
3630STAT_ATTR(FLUSH_FREE_LIST_OBJECTS, flush_free_list_objects);
3631STAT_ATTR(FLUSH_FREE_LIST_REMOTE, flush_free_list_remote);
3632STAT_ATTR(FLUSH_SLAB_PARTIAL, flush_slab_partial);
3633STAT_ATTR(FLUSH_SLAB_FREE, flush_slab_free);
3634STAT_ATTR(FLUSH_RFREE_LIST, flush_rfree_list);
3635STAT_ATTR(FLUSH_RFREE_LIST_OBJECTS, flush_rfree_list_objects);
3636STAT_ATTR(CLAIM_REMOTE_LIST, claim_remote_list);
3637STAT_ATTR(CLAIM_REMOTE_LIST_OBJECTS, claim_remote_list_objects);
3638#endif
3639
3640static struct attribute *slab_attrs[] = {
3641 &slab_size_attr.attr,
3642 &object_size_attr.attr,
3643 &objs_per_slab_attr.attr,
3644 &order_attr.attr,
3645 &objects_attr.attr,
3646 &total_objects_attr.attr,
3647 &slabs_attr.attr,
3648 &ctor_attr.attr,
3649 &align_attr.attr,
3650 &hwcache_align_attr.attr,
3651 &reclaim_account_attr.attr,
3652 &destroy_by_rcu_attr.attr,
3653 &red_zone_attr.attr,
3654 &poison_attr.attr,
3655 &store_user_attr.attr,
3656 &hiwater_attr.attr,
3657 &freebatch_attr.attr,
3658#ifdef CONFIG_ZONE_DMA
3659 &cache_dma_attr.attr,
3660#endif
3661#ifdef CONFIG_SLQB_STATS
3662 &alloc_attr.attr,
3663 &alloc_slab_fill_attr.attr,
3664 &alloc_slab_new_attr.attr,
3665 &free_attr.attr,
3666 &free_remote_attr.attr,
3667 &flush_free_list_attr.attr,
3668 &flush_free_list_objects_attr.attr,
3669 &flush_free_list_remote_attr.attr,
3670 &flush_slab_partial_attr.attr,
3671 &flush_slab_free_attr.attr,
3672 &flush_rfree_list_attr.attr,
3673 &flush_rfree_list_objects_attr.attr,
3674 &claim_remote_list_attr.attr,
3675 &claim_remote_list_objects_attr.attr,
3676#endif
3677#ifdef CONFIG_FAILSLAB
3678 &failslab_attr.attr,
3679#endif
3680
3681 NULL
3682};
3683
3684static struct attribute_group slab_attr_group = {
3685 .attrs = slab_attrs,
3686};
3687
3688static ssize_t slab_attr_show(struct kobject *kobj,
3689 struct attribute *attr, char *buf)
3690{
3691 struct slab_attribute *attribute;
3692 struct kmem_cache *s;
3693 int err;
3694
3695 attribute = to_slab_attr(attr);
3696 s = to_slab(kobj);
3697
3698 if (!attribute->show)
3699 return -EIO;
3700
3701 err = attribute->show(s, buf);
3702
3703 return err;
3704}
3705
3706static ssize_t slab_attr_store(struct kobject *kobj,
3707 struct attribute *attr, const char *buf, size_t len)
3708{
3709 struct slab_attribute *attribute;
3710 struct kmem_cache *s;
3711 int err;
3712
3713 attribute = to_slab_attr(attr);
3714 s = to_slab(kobj);
3715
3716 if (!attribute->store)
3717 return -EIO;
3718
3719 err = attribute->store(s, buf, len);
3720
3721 return err;
3722}
3723
3724static void kmem_cache_release(struct kobject *kobj)
3725{
3726 struct kmem_cache *s = to_slab(kobj);
3727
3728 kmem_cache_free(&kmem_cache_cache, s);
3729}
3730
3731static struct sysfs_ops slab_sysfs_ops = {
3732 .show = slab_attr_show,
3733 .store = slab_attr_store,
3734};
3735
3736static struct kobj_type slab_ktype = {
3737 .sysfs_ops = &slab_sysfs_ops,
3738 .release = kmem_cache_release
3739};
3740
3741static int uevent_filter(struct kset *kset, struct kobject *kobj)
3742{
3743 struct kobj_type *ktype = get_ktype(kobj);
3744
3745 if (ktype == &slab_ktype)
3746 return 1;
3747 return 0;
3748}
3749
3750static struct kset_uevent_ops slab_uevent_ops = {
3751 .filter = uevent_filter,
3752};
3753
3754static struct kset *slab_kset;
3755
3756static int sysfs_available __read_mostly;
3757
3758static int sysfs_slab_add(struct kmem_cache *s)
3759{
3760 int err;
3761
3762 if (!sysfs_available)
3763 return 0;
3764
3765 s->kobj.kset = slab_kset;
3766 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, s->name);
3767 if (err) {
3768 kobject_put(&s->kobj);
3769 return err;
3770 }
3771
3772 err = sysfs_create_group(&s->kobj, &slab_attr_group);
3773 if (err)
3774 return err;
3775
3776 kobject_uevent(&s->kobj, KOBJ_ADD);
3777
3778 return 0;
3779}
3780
3781static void sysfs_slab_remove(struct kmem_cache *s)
3782{
3783 kobject_uevent(&s->kobj, KOBJ_REMOVE);
3784 kobject_del(&s->kobj);
3785 kobject_put(&s->kobj);
3786}
3787
3788static int __init slab_sysfs_init(void)
3789{
3790 struct kmem_cache *s;
3791 int err;
3792
3793 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
3794 if (!slab_kset) {
3795 printk(KERN_ERR "Cannot register slab subsystem.\n");
3796 return -ENOSYS;
3797 }
3798
3799 down_write(&slqb_lock);
3800
3801 sysfs_available = 1;
3802
3803 list_for_each_entry(s, &slab_caches, list) {
3804 err = sysfs_slab_add(s);
3805 if (err)
3806 printk(KERN_ERR "SLQB: Unable to add boot slab %s"
3807 " to sysfs\n", s->name);
3808 }
3809
3810 up_write(&slqb_lock);
3811
3812 return 0;
3813}
3814device_initcall(slab_sysfs_init);
3815
3816#endif
diff --git a/mm/slub.c b/mm/slub.c
index 35f351f2619..1796b6513ca 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1457,6 +1457,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1457 struct zone *zone; 1457 struct zone *zone;
1458 enum zone_type high_zoneidx = gfp_zone(flags); 1458 enum zone_type high_zoneidx = gfp_zone(flags);
1459 struct page *page; 1459 struct page *page;
1460 unsigned int cpuset_mems_cookie;
1460 1461
1461 /* 1462 /*
1462 * The defrag ratio allows a configuration of the tradeoffs between 1463 * The defrag ratio allows a configuration of the tradeoffs between
@@ -1480,23 +1481,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1480 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1481 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1481 return NULL; 1482 return NULL;
1482 1483
1483 get_mems_allowed(); 1484 do {
1484 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1485 cpuset_mems_cookie = get_mems_allowed();
1485 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1486 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1486 struct kmem_cache_node *n; 1487 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1487 1488 struct kmem_cache_node *n;
1488 n = get_node(s, zone_to_nid(zone)); 1489
1489 1490 n = get_node(s, zone_to_nid(zone));
1490 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1491
1491 n->nr_partial > s->min_partial) { 1492 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1492 page = get_partial_node(n); 1493 n->nr_partial > s->min_partial) {
1493 if (page) { 1494 page = get_partial_node(n);
1494 put_mems_allowed(); 1495 if (page) {
1495 return page; 1496 /*
1497 * Return the object even if
1498 * put_mems_allowed indicated that
1499 * the cpuset mems_allowed was
1500 * updated in parallel. It's a
1501 * harmless race between the alloc
1502 * and the cpuset update.
1503 */
1504 put_mems_allowed(cpuset_mems_cookie);
1505 return page;
1506 }
1496 } 1507 }
1497 } 1508 }
1498 } 1509 } while (!put_mems_allowed(cpuset_mems_cookie));
1499 put_mems_allowed();
1500#endif 1510#endif
1501 return NULL; 1511 return NULL;
1502} 1512}
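The hunk above replaces the plain get_mems_allowed()/put_mems_allowed() pair with a sequence-count style cookie so the zonelist walk is retried if the cpuset's mems_allowed changes mid-walk. A minimal sketch of that pattern, where try_alloc_from_allowed_nodes() is a hypothetical stand-in for the walk in get_any_partial():

    /*
     * Sketch (not from the patch): cookie-based retry around a
     * cpuset-constrained allocation attempt.
     */
    static void *alloc_with_cpuset_retry(void)
    {
    	unsigned int cpuset_mems_cookie;
    	void *obj;

    	do {
    		cpuset_mems_cookie = get_mems_allowed();
    		obj = try_alloc_from_allowed_nodes();	/* hypothetical walk */
    		if (obj) {
    			/* Keep the result even if mems_allowed changed meanwhile. */
    			put_mems_allowed(cpuset_mems_cookie);
    			return obj;
    		}
    	} while (!put_mems_allowed(cpuset_mems_cookie));

    	return NULL;
    }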
@@ -1818,6 +1828,11 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1818 if (unlikely(!node_match(c, node))) 1828 if (unlikely(!node_match(c, node)))
1819 goto another_slab; 1829 goto another_slab;
1820 1830
1831 /* must check again c->freelist in case of cpu migration or IRQ */
1832 object = c->freelist;
1833 if (object)
1834 goto update_freelist;
1835
1821 stat(s, ALLOC_REFILL); 1836 stat(s, ALLOC_REFILL);
1822 1837
1823load_freelist: 1838load_freelist:
@@ -1827,6 +1842,7 @@ load_freelist:
1827 if (kmem_cache_debug(s)) 1842 if (kmem_cache_debug(s))
1828 goto debug; 1843 goto debug;
1829 1844
1845update_freelist:
1830 c->freelist = get_freepointer(s, object); 1846 c->freelist = get_freepointer(s, object);
1831 page->inuse = page->objects; 1847 page->inuse = page->objects;
1832 page->freelist = NULL; 1848 page->freelist = NULL;
@@ -2163,7 +2179,7 @@ EXPORT_SYMBOL(kmem_cache_free);
2163 * take the list_lock. 2179 * take the list_lock.
2164 */ 2180 */
2165static int slub_min_order; 2181static int slub_min_order;
2166static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; 2182static int slub_max_order;
2167static int slub_min_objects; 2183static int slub_min_objects;
2168 2184
2169/* 2185/*
@@ -3433,13 +3449,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3433 if (kmem_cache_open(s, n, 3449 if (kmem_cache_open(s, n,
3434 size, align, flags, ctor)) { 3450 size, align, flags, ctor)) {
3435 list_add(&s->list, &slab_caches); 3451 list_add(&s->list, &slab_caches);
3452 up_write(&slub_lock);
3436 if (sysfs_slab_add(s)) { 3453 if (sysfs_slab_add(s)) {
3454 down_write(&slub_lock);
3437 list_del(&s->list); 3455 list_del(&s->list);
3438 kfree(n); 3456 kfree(n);
3439 kfree(s); 3457 kfree(s);
3440 goto err; 3458 goto err;
3441 } 3459 }
3442 up_write(&slub_lock);
3443 return s; 3460 return s;
3444 } 3461 }
3445 kfree(n); 3462 kfree(n);
diff --git a/mm/sparse.c b/mm/sparse.c
index aa64b12831a..4cd05e5f2f4 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -353,29 +353,21 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
353 353
354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), 354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
355 usemap_count); 355 usemap_count);
356 if (usemap) { 356 if (!usemap) {
357 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 357 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
358 if (!present_section_nr(pnum)) 358 if (!usemap) {
359 continue; 359 printk(KERN_WARNING "%s: allocation failed\n", __func__);
360 usemap_map[pnum] = usemap; 360 return;
361 usemap += size;
362 } 361 }
363 return;
364 } 362 }
365 363
366 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); 364 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
367 if (usemap) { 365 if (!present_section_nr(pnum))
368 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 366 continue;
369 if (!present_section_nr(pnum)) 367 usemap_map[pnum] = usemap;
370 continue; 368 usemap += size;
371 usemap_map[pnum] = usemap; 369 check_usemap_section_nr(nodeid, usemap_map[pnum]);
372 usemap += size;
373 check_usemap_section_nr(nodeid, usemap_map[pnum]);
374 }
375 return;
376 } 370 }
377
378 printk(KERN_WARNING "%s: allocation failed\n", __func__);
379} 371}
380 372
381#ifndef CONFIG_SPARSEMEM_VMEMMAP 373#ifndef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/mm/swap.c b/mm/swap.c
index 3a442f18b0b..4a1fc6db89e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page)
78{ 78{
79 if (unlikely(PageTail(page))) { 79 if (unlikely(PageTail(page))) {
80 /* __split_huge_page_refcount can run under us */ 80 /* __split_huge_page_refcount can run under us */
81 struct page *page_head = page->first_page; 81 struct page *page_head = compound_trans_head(page);
82 smp_rmb(); 82
83 /* 83 if (likely(page != page_head &&
84 * If PageTail is still set after smp_rmb() we can be sure 84 get_page_unless_zero(page_head))) {
85 * that the page->first_page we read wasn't a dangling pointer.
86 * See __split_huge_page_refcount() smp_wmb().
87 */
88 if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
89 unsigned long flags; 85 unsigned long flags;
90 /* 86 /*
91 * Verify that our page_head wasn't converted 87 * page_head wasn't a dangling pointer but it
92 * to a a regular page before we got a 88 * may not be a head page anymore by the time
93 * reference on it. 89 * we obtain the lock. That is ok as long as it
90 * can't be freed from under us.
94 */ 91 */
95 if (unlikely(!PageHead(page_head))) {
96 /* PageHead is cleared after PageTail */
97 smp_rmb();
98 VM_BUG_ON(PageTail(page));
99 goto out_put_head;
100 }
101 /*
102 * Only run compound_lock on a valid PageHead,
103 * after having it pinned with
104 * get_page_unless_zero() above.
105 */
106 smp_mb();
107 /* page_head wasn't a dangling pointer */
108 flags = compound_lock_irqsave(page_head); 92 flags = compound_lock_irqsave(page_head);
109 if (unlikely(!PageTail(page))) { 93 if (unlikely(!PageTail(page))) {
110 /* __split_huge_page_refcount run before us */ 94 /* __split_huge_page_refcount run before us */
111 compound_unlock_irqrestore(page_head, flags); 95 compound_unlock_irqrestore(page_head, flags);
112 VM_BUG_ON(PageHead(page_head)); 96 VM_BUG_ON(PageHead(page_head));
113 out_put_head:
114 if (put_page_testzero(page_head)) 97 if (put_page_testzero(page_head))
115 __put_single_page(page_head); 98 __put_single_page(page_head);
116 out_put_single: 99 out_put_single:
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page)
121 VM_BUG_ON(page_head != page->first_page); 104 VM_BUG_ON(page_head != page->first_page);
122 /* 105 /*
123 * We can release the refcount taken by 106 * We can release the refcount taken by
124 * get_page_unless_zero now that 107 * get_page_unless_zero() now that
125 * split_huge_page_refcount is blocked on the 108 * __split_huge_page_refcount() is blocked on
126 * compound_lock. 109 * the compound_lock.
127 */ 110 */
128 if (put_page_testzero(page_head)) 111 if (put_page_testzero(page_head))
129 VM_BUG_ON(1); 112 VM_BUG_ON(1);
130 /* __split_huge_page_refcount will wait now */ 113 /* __split_huge_page_refcount will wait now */
131 VM_BUG_ON(atomic_read(&page->_count) <= 0); 114 VM_BUG_ON(page_mapcount(page) <= 0);
132 atomic_dec(&page->_count); 115 atomic_dec(&page->_mapcount);
133 VM_BUG_ON(atomic_read(&page_head->_count) <= 0); 116 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
117 VM_BUG_ON(atomic_read(&page->_count) != 0);
134 compound_unlock_irqrestore(page_head, flags); 118 compound_unlock_irqrestore(page_head, flags);
135 if (put_page_testzero(page_head)) { 119 if (put_page_testzero(page_head)) {
136 if (PageHead(page_head)) 120 if (PageHead(page_head))
@@ -160,6 +144,45 @@ void put_page(struct page *page)
160} 144}
161EXPORT_SYMBOL(put_page); 145EXPORT_SYMBOL(put_page);
162 146
147/*
148 * This function is exported but must not be called by anything other
149 * than get_page(). It implements the slow path of get_page().
150 */
151bool __get_page_tail(struct page *page)
152{
153 /*
154 * This takes care of get_page() if run on a tail page
155 * returned by one of the get_user_pages/follow_page variants.
156 * get_user_pages/follow_page itself doesn't need the compound
157 * lock because it runs __get_page_tail_foll() under the
158 * proper PT lock that already serializes against
159 * split_huge_page().
160 */
161 unsigned long flags;
162 bool got = false;
163 struct page *page_head = compound_trans_head(page);
164
165 if (likely(page != page_head && get_page_unless_zero(page_head))) {
166 /*
167 * page_head wasn't a dangling pointer but it
168 * may not be a head page anymore by the time
169 * we obtain the lock. That is ok as long as it
170 * can't be freed from under us.
171 */
172 flags = compound_lock_irqsave(page_head);
173 /* here __split_huge_page_refcount won't run anymore */
174 if (likely(PageTail(page))) {
175 __get_page_tail_foll(page, false);
176 got = true;
177 }
178 compound_unlock_irqrestore(page_head, flags);
179 if (unlikely(!got))
180 put_page(page_head);
181 }
182 return got;
183}
184EXPORT_SYMBOL(__get_page_tail);
185
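As context for the exported slow path above, the matching get_page() fast path (assumed here; it lives in include/linux/mm.h, not in this hunk) would look roughly like:

    /* Sketch: assumed get_page() counterpart of __get_page_tail() above. */
    static inline void get_page_sketch(struct page *page)
    {
    	if (unlikely(PageTail(page)))
    		if (likely(__get_page_tail(page)))
    			return;
    	/*
    	 * Head and order-0 pages must already have an elevated
    	 * refcount; just bump it.
    	 */
    	VM_BUG_ON(atomic_read(&page->_count) <= 0);
    	atomic_inc(&page->_count);
    }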
163/** 186/**
164 * put_pages_list() - release a list of pages 187 * put_pages_list() - release a list of pages
165 * @pages: list of pages threaded on page->lru 188 * @pages: list of pages threaded on page->lru
@@ -644,7 +667,7 @@ void lru_add_page_tail(struct zone* zone,
644 VM_BUG_ON(!PageHead(page)); 667 VM_BUG_ON(!PageHead(page));
645 VM_BUG_ON(PageCompound(page_tail)); 668 VM_BUG_ON(PageCompound(page_tail));
646 VM_BUG_ON(PageLRU(page_tail)); 669 VM_BUG_ON(PageLRU(page_tail));
647 VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); 670 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock));
648 671
649 SetPageLRU(page_tail); 672 SetPageLRU(page_tail);
650 673
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 46680461785..10e9198778c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -28,7 +28,7 @@
28 */ 28 */
29static const struct address_space_operations swap_aops = { 29static const struct address_space_operations swap_aops = {
30 .writepage = swap_writepage, 30 .writepage = swap_writepage,
31 .set_page_dirty = __set_page_dirty_nobuffers, 31 .set_page_dirty = __set_page_dirty_no_writeback,
32 .migratepage = migrate_page, 32 .migratepage = migrate_page,
33}; 33};
34 34
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ff8dc1a18cb..c8f4338848d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
932 pmd = pmd_offset(pud, addr); 932 pmd = pmd_offset(pud, addr);
933 do { 933 do {
934 next = pmd_addr_end(addr, end); 934 next = pmd_addr_end(addr, end);
935 if (unlikely(pmd_trans_huge(*pmd))) 935 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
936 continue;
937 if (pmd_none_or_clear_bad(pmd))
938 continue; 936 continue;
939 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 937 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
940 if (ret) 938 if (ret)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1d34d75366a..bdb70042c12 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -256,7 +256,7 @@ struct vmap_area {
256 struct rb_node rb_node; /* address sorted rbtree */ 256 struct rb_node rb_node; /* address sorted rbtree */
257 struct list_head list; /* address sorted list */ 257 struct list_head list; /* address sorted list */
258 struct list_head purge_list; /* "lazy purge" list */ 258 struct list_head purge_list; /* "lazy purge" list */
259 void *private; 259 struct vm_struct *vm;
260 struct rcu_head rcu_head; 260 struct rcu_head rcu_head;
261}; 261};
262 262
@@ -732,9 +732,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr)
732#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) 732#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
733#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ 733#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
734#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ 734#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
735#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ 735#define VMAP_BBMAP_BITS \
736 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ 736 VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
737 VMALLOC_PAGES / NR_CPUS / 16)) 737 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
738 VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
738 739
739#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) 740#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
740 741
@@ -1173,9 +1174,10 @@ void __init vmalloc_init(void)
1173 /* Import existing vmlist entries. */ 1174 /* Import existing vmlist entries. */
1174 for (tmp = vmlist; tmp; tmp = tmp->next) { 1175 for (tmp = vmlist; tmp; tmp = tmp->next) {
1175 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); 1176 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1176 va->flags = tmp->flags | VM_VM_AREA; 1177 va->flags = VM_VM_AREA;
1177 va->va_start = (unsigned long)tmp->addr; 1178 va->va_start = (unsigned long)tmp->addr;
1178 va->va_end = va->va_start + tmp->size; 1179 va->va_end = va->va_start + tmp->size;
1180 va->vm = tmp;
1179 __insert_vmap_area(va); 1181 __insert_vmap_area(va);
1180 } 1182 }
1181 1183
@@ -1266,18 +1268,22 @@ EXPORT_SYMBOL_GPL(map_vm_area);
1266DEFINE_RWLOCK(vmlist_lock); 1268DEFINE_RWLOCK(vmlist_lock);
1267struct vm_struct *vmlist; 1269struct vm_struct *vmlist;
1268 1270
1269static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1271static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1270 unsigned long flags, void *caller) 1272 unsigned long flags, void *caller)
1271{ 1273{
1272 struct vm_struct *tmp, **p;
1273
1274 vm->flags = flags; 1274 vm->flags = flags;
1275 vm->addr = (void *)va->va_start; 1275 vm->addr = (void *)va->va_start;
1276 vm->size = va->va_end - va->va_start; 1276 vm->size = va->va_end - va->va_start;
1277 vm->caller = caller; 1277 vm->caller = caller;
1278 va->private = vm; 1278 va->vm = vm;
1279 va->flags |= VM_VM_AREA; 1279 va->flags |= VM_VM_AREA;
1280}
1280 1281
1282static void insert_vmalloc_vmlist(struct vm_struct *vm)
1283{
1284 struct vm_struct *tmp, **p;
1285
1286 vm->flags &= ~VM_UNLIST;
1281 write_lock(&vmlist_lock); 1287 write_lock(&vmlist_lock);
1282 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { 1288 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1283 if (tmp->addr >= vm->addr) 1289 if (tmp->addr >= vm->addr)
@@ -1288,6 +1294,13 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1288 write_unlock(&vmlist_lock); 1294 write_unlock(&vmlist_lock);
1289} 1295}
1290 1296
1297static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1298 unsigned long flags, void *caller)
1299{
1300 setup_vmalloc_vm(vm, va, flags, caller);
1301 insert_vmalloc_vmlist(vm);
1302}
1303
1291static struct vm_struct *__get_vm_area_node(unsigned long size, 1304static struct vm_struct *__get_vm_area_node(unsigned long size,
1292 unsigned long align, unsigned long flags, unsigned long start, 1305 unsigned long align, unsigned long flags, unsigned long start,
1293 unsigned long end, int node, gfp_t gfp_mask, void *caller) 1306 unsigned long end, int node, gfp_t gfp_mask, void *caller)
@@ -1326,7 +1339,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1326 return NULL; 1339 return NULL;
1327 } 1340 }
1328 1341
1329 insert_vmalloc_vm(area, va, flags, caller); 1342 /*
1343 * When this function is called from __vmalloc_node_range,
1344 * we do not add vm_struct to vmlist here to avoid
1345 * accessing uninitialized members of vm_struct such as
1346 * pages and nr_pages fields. They will be set later.
1347 * To distinguish it from others, we use a VM_UNLIST flag.
1348 */
1349 if (flags & VM_UNLIST)
1350 setup_vmalloc_vm(area, va, flags, caller);
1351 else
1352 insert_vmalloc_vm(area, va, flags, caller);
1353
1330 return area; 1354 return area;
1331} 1355}
1332 1356
@@ -1374,7 +1398,7 @@ static struct vm_struct *find_vm_area(const void *addr)
1374 1398
1375 va = find_vmap_area((unsigned long)addr); 1399 va = find_vmap_area((unsigned long)addr);
1376 if (va && va->flags & VM_VM_AREA) 1400 if (va && va->flags & VM_VM_AREA)
1377 return va->private; 1401 return va->vm;
1378 1402
1379 return NULL; 1403 return NULL;
1380} 1404}
@@ -1393,18 +1417,21 @@ struct vm_struct *remove_vm_area(const void *addr)
1393 1417
1394 va = find_vmap_area((unsigned long)addr); 1418 va = find_vmap_area((unsigned long)addr);
1395 if (va && va->flags & VM_VM_AREA) { 1419 if (va && va->flags & VM_VM_AREA) {
1396 struct vm_struct *vm = va->private; 1420 struct vm_struct *vm = va->vm;
1397 struct vm_struct *tmp, **p; 1421
1398 /* 1422 if (!(vm->flags & VM_UNLIST)) {
1399 * remove from list and disallow access to this vm_struct 1423 struct vm_struct *tmp, **p;
1400 * before unmap. (address range confliction is maintained by 1424 /*
1401 * vmap.) 1425 * remove from list and disallow access to
1402 */ 1426 * this vm_struct before unmap. (address range
1403 write_lock(&vmlist_lock); 1427 * confliction is maintained by vmap.)
1404 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) 1428 */
1405 ; 1429 write_lock(&vmlist_lock);
1406 *p = tmp->next; 1430 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1407 write_unlock(&vmlist_lock); 1431 ;
1432 *p = tmp->next;
1433 write_unlock(&vmlist_lock);
1434 }
1408 1435
1409 vmap_debug_free_range(va->va_start, va->va_end); 1436 vmap_debug_free_range(va->va_start, va->va_end);
1410 free_unmap_vmap_area(va); 1437 free_unmap_vmap_area(va);
@@ -1615,13 +1642,21 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1615 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1642 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1616 return NULL; 1643 return NULL;
1617 1644
1618 area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, 1645 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
1619 gfp_mask, caller); 1646 start, end, node, gfp_mask, caller);
1620 1647
1621 if (!area) 1648 if (!area)
1622 return NULL; 1649 return NULL;
1623 1650
1624 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1651 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1652 if (!addr)
1653 return NULL;
1654
1655 /*
1656 * In this function, the newly allocated vm_struct is not added
1657 * to vmlist by __get_vm_area_node(), so it is added here.
1658 */
1659 insert_vmalloc_vmlist(area);
1625 1660
1626 /* 1661 /*
1627 * A ref_count = 3 is needed because the vm_struct and vmap_area 1662 * A ref_count = 3 is needed because the vm_struct and vmap_area
@@ -2153,6 +2188,14 @@ struct vm_struct *alloc_vm_area(size_t size)
2153 return NULL; 2188 return NULL;
2154 } 2189 }
2155 2190
2191 /*
2192 * If the allocated address space is passed to a hypercall
2193 * before being used then we cannot rely on a page fault to
2194 * trigger an update of the page tables. So sync all the page
2195 * tables here.
2196 */
2197 vmalloc_sync_all();
2198
2156 return area; 2199 return area;
2157} 2200}
2158EXPORT_SYMBOL_GPL(alloc_vm_area); 2201EXPORT_SYMBOL_GPL(alloc_vm_area);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d036e59d302..1eb3edf7920 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -248,35 +248,66 @@ unsigned long shrink_slab(struct shrink_control *shrink,
248 248
249 list_for_each_entry(shrinker, &shrinker_list, list) { 249 list_for_each_entry(shrinker, &shrinker_list, list) {
250 unsigned long long delta; 250 unsigned long long delta;
251 unsigned long total_scan; 251 long total_scan;
252 unsigned long max_pass; 252 long max_pass;
253 int shrink_ret = 0;
254 long nr;
255 long new_nr;
253 256
254 max_pass = do_shrinker_shrink(shrinker, shrink, 0); 257 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
258 if (max_pass <= 0)
259 continue;
260
261 /*
262 * copy the current shrinker scan count into a local variable
263 * and zero it so that other concurrent shrinker invocations
264 * don't also do this scanning work.
265 */
266 do {
267 nr = shrinker->nr;
268 } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
269
270 total_scan = nr;
255 delta = (4 * nr_pages_scanned) / shrinker->seeks; 271 delta = (4 * nr_pages_scanned) / shrinker->seeks;
256 delta *= max_pass; 272 delta *= max_pass;
257 do_div(delta, lru_pages + 1); 273 do_div(delta, lru_pages + 1);
258 shrinker->nr += delta; 274 total_scan += delta;
259 if (shrinker->nr < 0) { 275 if (total_scan < 0) {
260 printk(KERN_ERR "shrink_slab: %pF negative objects to " 276 printk(KERN_ERR "shrink_slab: %pF negative objects to "
261 "delete nr=%ld\n", 277 "delete nr=%ld\n",
262 shrinker->shrink, shrinker->nr); 278 shrinker->shrink, total_scan);
263 shrinker->nr = max_pass; 279 total_scan = max_pass;
264 } 280 }
265 281
266 /* 282 /*
283 * We need to avoid excessive windup on filesystem shrinkers
284 * due to large numbers of GFP_NOFS allocations causing the
285 * shrinkers to return -1 all the time. This results in a large
286 * nr being built up so when a shrink that can do some work
287 * comes along it empties the entire cache due to nr >>>
288 * max_pass. This is bad for sustaining a working set in
289 * memory.
290 *
291 * Hence only allow the shrinker to scan the entire cache when
292 * a large delta change is calculated directly.
293 */
294 if (delta < max_pass / 4)
295 total_scan = min(total_scan, max_pass / 2);
296
297 /*
267 * Avoid risking looping forever due to too large nr value: 298 * Avoid risking looping forever due to too large nr value:
268 * never try to free more than twice the estimate number of 299 * never try to free more than twice the estimate number of
269 * freeable entries. 300 * freeable entries.
270 */ 301 */
271 if (shrinker->nr > max_pass * 2) 302 if (total_scan > max_pass * 2)
272 shrinker->nr = max_pass * 2; 303 total_scan = max_pass * 2;
273 304
274 total_scan = shrinker->nr; 305 trace_mm_shrink_slab_start(shrinker, shrink, nr,
275 shrinker->nr = 0; 306 nr_pages_scanned, lru_pages,
307 max_pass, delta, total_scan);
276 308
277 while (total_scan >= SHRINK_BATCH) { 309 while (total_scan >= SHRINK_BATCH) {
278 long this_scan = SHRINK_BATCH; 310 long this_scan = SHRINK_BATCH;
279 int shrink_ret;
280 int nr_before; 311 int nr_before;
281 312
282 nr_before = do_shrinker_shrink(shrinker, shrink, 0); 313 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
@@ -292,7 +323,19 @@ unsigned long shrink_slab(struct shrink_control *shrink,
292 cond_resched(); 323 cond_resched();
293 } 324 }
294 325
295 shrinker->nr += total_scan; 326 /*
327 * move the unused scan count back into the shrinker in a
328 * manner that handles concurrent updates. If we exhausted the
329 * scan, there is no need to do an update.
330 */
331 do {
332 nr = shrinker->nr;
333 new_nr = total_scan + nr;
334 if (total_scan <= 0)
335 break;
336 } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
337
338 trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
296 } 339 }
297 up_read(&shrinker_rwsem); 340 up_read(&shrinker_rwsem);
298out: 341out:
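The cmpxchg() pairs above implement a small lock-free "claim and give back" protocol on shrinker->nr. A stand-alone sketch of that protocol, with the actual shrinking abstracted into a hypothetical do_scan() that returns how much was scanned:

    /*
     * Sketch (not from the patch): claim the deferred count atomically,
     * do the work, then return any unused scan count in a way that
     * tolerates concurrent shrink_slab() callers.
     */
    static void shrink_one(struct shrinker *shrinker, long delta)
    {
    	long nr, new_nr, total_scan, scanned;

    	/* Take the whole deferred count for ourselves. */
    	do {
    		nr = shrinker->nr;
    	} while (cmpxchg(&shrinker->nr, nr, 0) != nr);

    	total_scan = nr + delta;
    	scanned = do_scan(shrinker, total_scan);	/* hypothetical */

    	/* Hand back whatever we did not scan, allowing for concurrent adds. */
    	total_scan -= scanned;
    	do {
    		nr = shrinker->nr;
    		new_nr = total_scan + nr;
    		if (total_scan <= 0)
    			break;
    	} while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
    }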
@@ -455,15 +498,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
455 return PAGE_ACTIVATE; 498 return PAGE_ACTIVATE;
456 } 499 }
457 500
458 /*
459 * Wait on writeback if requested to. This happens when
460 * direct reclaiming a large contiguous area and the
461 * first attempt to free a range of pages fails.
462 */
463 if (PageWriteback(page) &&
464 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
465 wait_on_page_writeback(page);
466
467 if (!PageWriteback(page)) { 501 if (!PageWriteback(page)) {
468 /* synchronous write or broken a_ops? */ 502 /* synchronous write or broken a_ops? */
469 ClearPageReclaim(page); 503 ClearPageReclaim(page);
@@ -581,6 +615,10 @@ void putback_lru_page(struct page *page)
581 int was_unevictable = PageUnevictable(page); 615 int was_unevictable = PageUnevictable(page);
582 616
583 VM_BUG_ON(PageLRU(page)); 617 VM_BUG_ON(PageLRU(page));
618#ifdef CONFIG_CLEANCACHE
619 if (active)
620 SetPageWasActive(page);
621#endif
584 622
585redo: 623redo:
586 ClearPageUnevictable(page); 624 ClearPageUnevictable(page);
@@ -665,7 +703,7 @@ static enum page_references page_check_references(struct page *page,
665 return PAGEREF_RECLAIM; 703 return PAGEREF_RECLAIM;
666 704
667 if (referenced_ptes) { 705 if (referenced_ptes) {
668 if (PageAnon(page)) 706 if (PageSwapBacked(page))
669 return PAGEREF_ACTIVATE; 707 return PAGEREF_ACTIVATE;
670 /* 708 /*
671 * All mapped pages start out with page table 709 * All mapped pages start out with page table
@@ -683,7 +721,13 @@ static enum page_references page_check_references(struct page *page,
683 */ 721 */
684 SetPageReferenced(page); 722 SetPageReferenced(page);
685 723
686 if (referenced_page) 724 if (referenced_page || referenced_ptes > 1)
725 return PAGEREF_ACTIVATE;
726
727 /*
728 * Activate file-backed executable pages after first usage.
729 */
730 if (vm_flags & VM_EXEC)
687 return PAGEREF_ACTIVATE; 731 return PAGEREF_ACTIVATE;
688 732
689 return PAGEREF_KEEP; 733 return PAGEREF_KEEP;
@@ -719,7 +763,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
719 */ 763 */
720static unsigned long shrink_page_list(struct list_head *page_list, 764static unsigned long shrink_page_list(struct list_head *page_list,
721 struct zone *zone, 765 struct zone *zone,
722 struct scan_control *sc) 766 struct scan_control *sc,
767 int priority,
768 unsigned long *ret_nr_dirty,
769 unsigned long *ret_nr_writeback)
723{ 770{
724 LIST_HEAD(ret_pages); 771 LIST_HEAD(ret_pages);
725 LIST_HEAD(free_pages); 772 LIST_HEAD(free_pages);
@@ -727,6 +774,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
727 unsigned long nr_dirty = 0; 774 unsigned long nr_dirty = 0;
728 unsigned long nr_congested = 0; 775 unsigned long nr_congested = 0;
729 unsigned long nr_reclaimed = 0; 776 unsigned long nr_reclaimed = 0;
777 unsigned long nr_writeback = 0;
730 778
731 cond_resched(); 779 cond_resched();
732 780
@@ -763,13 +811,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
763 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 811 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
764 812
765 if (PageWriteback(page)) { 813 if (PageWriteback(page)) {
814 nr_writeback++;
766 /* 815 /*
767 * Synchronous reclaim is performed in two passes, 816 * Synchronous reclaim cannot queue pages for
768 * first an asynchronous pass over the list to 817 * writeback due to the possibility of stack overflow
769 * start parallel writeback, and a second synchronous 818 * but if it encounters a page under writeback, wait
770 * pass to wait for the IO to complete. Wait here 819 * for the IO to complete.
771 * for any page for which writeback has already
772 * started.
773 */ 820 */
774 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && 821 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
775 may_enter_fs) 822 may_enter_fs)
@@ -825,6 +872,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
825 if (PageDirty(page)) { 872 if (PageDirty(page)) {
826 nr_dirty++; 873 nr_dirty++;
827 874
875 /*
876 * Only kswapd can writeback filesystem pages to
877 * avoid risk of stack overflow but do not writeback
878 * unless under significant pressure.
879 */
880 if (page_is_file_cache(page) &&
881 (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
882 /*
883 * Immediately reclaim when written back.
884 * Similar in principle to deactivate_page()
885 * except we already have the page isolated
886 * and know it's dirty
887 */
888 inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
889 SetPageReclaim(page);
890
891 goto keep_locked;
892 }
893
828 if (references == PAGEREF_RECLAIM_CLEAN) 894 if (references == PAGEREF_RECLAIM_CLEAN)
829 goto keep_locked; 895 goto keep_locked;
830 if (!may_enter_fs) 896 if (!may_enter_fs)
@@ -959,6 +1025,8 @@ keep_lumpy:
959 1025
960 list_splice(&ret_pages, page_list); 1026 list_splice(&ret_pages, page_list);
961 count_vm_events(PGACTIVATE, pgactivate); 1027 count_vm_events(PGACTIVATE, pgactivate);
1028 *ret_nr_dirty += nr_dirty;
1029 *ret_nr_writeback += nr_writeback;
962 return nr_reclaimed; 1030 return nr_reclaimed;
963} 1031}
964 1032
@@ -972,23 +1040,27 @@ keep_lumpy:
972 * 1040 *
973 * returns 0 on success, -ve errno on failure. 1041 * returns 0 on success, -ve errno on failure.
974 */ 1042 */
975int __isolate_lru_page(struct page *page, int mode, int file) 1043int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
976{ 1044{
1045 bool all_lru_mode;
977 int ret = -EINVAL; 1046 int ret = -EINVAL;
978 1047
979 /* Only take pages on the LRU. */ 1048 /* Only take pages on the LRU. */
980 if (!PageLRU(page)) 1049 if (!PageLRU(page))
981 return ret; 1050 return ret;
982 1051
1052 all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
1053 (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
1054
983 /* 1055 /*
984 * When checking the active state, we need to be sure we are 1056 * When checking the active state, we need to be sure we are
985 * dealing with comparable boolean values. Take the logical not 1057
986 * of each. 1058 * of each.
987 */ 1059 */
988 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 1060 if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
989 return ret; 1061 return ret;
990 1062
991 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) 1063 if (!all_lru_mode && !!page_is_file_cache(page) != file)
992 return ret; 1064 return ret;
993 1065
994 /* 1066 /*
@@ -1001,6 +1073,43 @@ int __isolate_lru_page(struct page *page, int mode, int file)
1001 1073
1002 ret = -EBUSY; 1074 ret = -EBUSY;
1003 1075
1076 /*
1077 * To minimise LRU disruption, the caller can indicate that it only
1078 * wants to isolate pages it will be able to operate on without
1079 * blocking - clean pages for the most part.
1080 *
1081 * ISOLATE_CLEAN means that only clean pages should be isolated. This
1082 * is used by reclaim when it cannot write to backing storage
1083 *
1084 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
1085 * that it is possible to migrate without blocking
1086 */
1087 if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
1088 /* All the caller can do on PageWriteback is block */
1089 if (PageWriteback(page))
1090 return ret;
1091
1092 if (PageDirty(page)) {
1093 struct address_space *mapping;
1094
1095 /* ISOLATE_CLEAN means only clean pages */
1096 if (mode & ISOLATE_CLEAN)
1097 return ret;
1098
1099 /*
1100 * Only pages without mappings or that have a
1101 * ->migratepage callback are possible to migrate
1102 * without blocking
1103 */
1104 mapping = page_mapping(page);
1105 if (mapping && !mapping->a_ops->migratepage)
1106 return ret;
1107 }
1108 }
1109
1110 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1111 return ret;
1112
1004 if (likely(get_page_unless_zero(page))) { 1113 if (likely(get_page_unless_zero(page))) {
1005 /* 1114 /*
1006 * Be careful not to clear PageLRU until after we're 1115 * Be careful not to clear PageLRU until after we're
@@ -1036,7 +1145,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
1036 */ 1145 */
1037static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1146static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1038 struct list_head *src, struct list_head *dst, 1147 struct list_head *src, struct list_head *dst,
1039 unsigned long *scanned, int order, int mode, int file) 1148 unsigned long *scanned, int order, isolate_mode_t mode,
1149 int file)
1040{ 1150{
1041 unsigned long nr_taken = 0; 1151 unsigned long nr_taken = 0;
1042 unsigned long nr_lumpy_taken = 0; 1152 unsigned long nr_lumpy_taken = 0;
@@ -1111,7 +1221,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1111 * anon page which don't already have a swap slot is 1221 * anon page which don't already have a swap slot is
1112 * pointless. 1222 * pointless.
1113 */ 1223 */
1114 if (nr_swap_pages <= 0 && PageAnon(cursor_page) && 1224 if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
1115 !PageSwapCache(cursor_page)) 1225 !PageSwapCache(cursor_page))
1116 break; 1226 break;
1117 1227
@@ -1161,8 +1271,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1161static unsigned long isolate_pages_global(unsigned long nr, 1271static unsigned long isolate_pages_global(unsigned long nr,
1162 struct list_head *dst, 1272 struct list_head *dst,
1163 unsigned long *scanned, int order, 1273 unsigned long *scanned, int order,
1164 int mode, struct zone *z, 1274 isolate_mode_t mode,
1165 int active, int file) 1275 struct zone *z, int active, int file)
1166{ 1276{
1167 int lru = LRU_BASE; 1277 int lru = LRU_BASE;
1168 if (active) 1278 if (active)
@@ -1190,6 +1300,9 @@ static unsigned long clear_active_flags(struct list_head *page_list,
1190 if (PageActive(page)) { 1300 if (PageActive(page)) {
1191 lru += LRU_ACTIVE; 1301 lru += LRU_ACTIVE;
1192 ClearPageActive(page); 1302 ClearPageActive(page);
1303#ifdef CONFIG_CLEANCACHE
1304 SetPageWasActive(page);
1305#endif
1193 nr_active += numpages; 1306 nr_active += numpages;
1194 } 1307 }
1195 if (count) 1308 if (count)
@@ -1354,7 +1467,7 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
1354} 1467}
1355 1468
1356/* 1469/*
1357 * Returns true if the caller should wait to clean dirty/writeback pages. 1470 * Returns true if a direct reclaim should wait on pages under writeback.
1358 * 1471 *
1359 * If we are direct reclaiming for contiguous pages and we do not reclaim 1472 * If we are direct reclaiming for contiguous pages and we do not reclaim
1360 * everything in the list, try again and wait for writeback IO to complete. 1473 * everything in the list, try again and wait for writeback IO to complete.
@@ -1408,6 +1521,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1408 unsigned long nr_taken; 1521 unsigned long nr_taken;
1409 unsigned long nr_anon; 1522 unsigned long nr_anon;
1410 unsigned long nr_file; 1523 unsigned long nr_file;
1524 unsigned long nr_dirty = 0;
1525 unsigned long nr_writeback = 0;
1526
1527 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
1411 1528
1412 while (unlikely(too_many_isolated(zone, file, sc))) { 1529 while (unlikely(too_many_isolated(zone, file, sc))) {
1413 congestion_wait(BLK_RW_ASYNC, HZ/10); 1530 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1418,15 +1535,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1418 } 1535 }
1419 1536
1420 set_reclaim_mode(priority, sc, false); 1537 set_reclaim_mode(priority, sc, false);
1538 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1539 reclaim_mode |= ISOLATE_ACTIVE;
1540
1421 lru_add_drain(); 1541 lru_add_drain();
1542
1543 if (!sc->may_unmap)
1544 reclaim_mode |= ISOLATE_UNMAPPED;
1545 if (!sc->may_writepage)
1546 reclaim_mode |= ISOLATE_CLEAN;
1547
1422 spin_lock_irq(&zone->lru_lock); 1548 spin_lock_irq(&zone->lru_lock);
1423 1549
1424 if (scanning_global_lru(sc)) { 1550 if (scanning_global_lru(sc)) {
1425 nr_taken = isolate_pages_global(nr_to_scan, 1551 nr_taken = isolate_pages_global(nr_to_scan, &page_list,
1426 &page_list, &nr_scanned, sc->order, 1552 &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
1427 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1428 ISOLATE_BOTH : ISOLATE_INACTIVE,
1429 zone, 0, file);
1430 zone->pages_scanned += nr_scanned; 1553 zone->pages_scanned += nr_scanned;
1431 if (current_is_kswapd()) 1554 if (current_is_kswapd())
1432 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1555 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1435,12 +1558,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1435 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1558 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1436 nr_scanned); 1559 nr_scanned);
1437 } else { 1560 } else {
1438 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1561 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
1439 &page_list, &nr_scanned, sc->order, 1562 &nr_scanned, sc->order, reclaim_mode, zone,
1440 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? 1563 sc->mem_cgroup, 0, file);
1441 ISOLATE_BOTH : ISOLATE_INACTIVE,
1442 zone, sc->mem_cgroup,
1443 0, file);
1444 /* 1564 /*
1445 * mem_cgroup_isolate_pages() keeps track of 1565 * mem_cgroup_isolate_pages() keeps track of
1446 * scanned pages on its own. 1566 * scanned pages on its own.
@@ -1456,12 +1576,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1456 1576
1457 spin_unlock_irq(&zone->lru_lock); 1577 spin_unlock_irq(&zone->lru_lock);
1458 1578
1459 nr_reclaimed = shrink_page_list(&page_list, zone, sc); 1579 nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
1580 &nr_dirty, &nr_writeback);
1460 1581
1461 /* Check if we should synchronously wait for writeback */ 1582
1462 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1583 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1463 set_reclaim_mode(priority, sc, true); 1584 set_reclaim_mode(priority, sc, true);
1464 nr_reclaimed += shrink_page_list(&page_list, zone, sc); 1585 nr_reclaimed += shrink_page_list(&page_list, zone, sc,
1586 priority, &nr_dirty, &nr_writeback);
1465 } 1587 }
1466 1588
1467 local_irq_disable(); 1589 local_irq_disable();
@@ -1471,6 +1593,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1471 1593
1472 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1594 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1473 1595
1596 /*
1597 * If reclaim is isolating dirty pages under writeback, it implies
1598 * that the long-lived page allocation rate is exceeding the page
1599 * laundering rate. Either the global limits are not being effective
1600 * at throttling processes due to the page distribution throughout
1601 * zones or there is heavy usage of a slow backing device. The
1602 * only option is to throttle from reclaim context which is not ideal
1603 * as there is no guarantee the dirtying process is throttled in the
1604 * same way balance_dirty_pages() manages.
1605 *
1606 * This scales the number of dirty pages that must be under writeback
1607 * before throttling depending on priority. It is a simple backoff
1608 * function that has the most effect in the range DEF_PRIORITY to
1609 * DEF_PRIORITY-2, the range in which reclaim is considered to be
1610 * in trouble and kswapd is struggling to keep up.
1611 *
1612 * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle
1613 * DEF_PRIORITY-1 50% must be PageWriteback
1614 * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble
1615 * ...
1616 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1617 * isolated page is PageWriteback
1618 */
1619 if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
1620 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1621
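
[Editor's note] As a worked example of the backoff described in the comment above (assuming DEF_PRIORITY is 12 and SWAP_CLUSTER_MAX is 32, as in mainline at the time), the writeback threshold halves with each priority drop until any PageWriteback page is enough to trigger the wait_iff_congested() stall. A standalone sketch:

#include <stdio.h>

#define DEF_PRIORITY 12		/* mainline value, assumed for this sketch */

int main(void)
{
	unsigned long nr_taken = 32;	/* e.g. SWAP_CLUSTER_MAX pages isolated */
	int priority;

	for (priority = DEF_PRIORITY; priority >= DEF_PRIORITY - 6; priority--) {
		unsigned long threshold = nr_taken >> (DEF_PRIORITY - priority);
		/* At DEF_PRIORITY-6 the shift yields 0, so a single
		 * PageWriteback page is enough to throttle. */
		printf("priority %2d: throttle once nr_writeback >= %lu\n",
		       priority, threshold);
	}
	return 0;
}
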
1474 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1622 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1475 zone_idx(zone), 1623 zone_idx(zone),
1476 nr_scanned, nr_reclaimed, 1624 nr_scanned, nr_reclaimed,
@@ -1542,19 +1690,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1542 struct page *page; 1690 struct page *page;
1543 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1691 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1544 unsigned long nr_rotated = 0; 1692 unsigned long nr_rotated = 0;
1693 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
1545 1694
1546 lru_add_drain(); 1695 lru_add_drain();
1696
1697 if (!sc->may_unmap)
1698 reclaim_mode |= ISOLATE_UNMAPPED;
1699 if (!sc->may_writepage)
1700 reclaim_mode |= ISOLATE_CLEAN;
1701
1547 spin_lock_irq(&zone->lru_lock); 1702 spin_lock_irq(&zone->lru_lock);
1548 if (scanning_global_lru(sc)) { 1703 if (scanning_global_lru(sc)) {
1549 nr_taken = isolate_pages_global(nr_pages, &l_hold, 1704 nr_taken = isolate_pages_global(nr_pages, &l_hold,
1550 &pgscanned, sc->order, 1705 &pgscanned, sc->order,
1551 ISOLATE_ACTIVE, zone, 1706 reclaim_mode, zone,
1552 1, file); 1707 1, file);
1553 zone->pages_scanned += pgscanned; 1708 zone->pages_scanned += pgscanned;
1554 } else { 1709 } else {
1555 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, 1710 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1556 &pgscanned, sc->order, 1711 &pgscanned, sc->order,
1557 ISOLATE_ACTIVE, zone, 1712 reclaim_mode, zone,
1558 sc->mem_cgroup, 1, file); 1713 sc->mem_cgroup, 1, file);
1559 /* 1714 /*
1560 * mem_cgroup_isolate_pages() keeps track of 1715 * mem_cgroup_isolate_pages() keeps track of
@@ -1600,6 +1755,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1600 } 1755 }
1601 1756
1602 ClearPageActive(page); /* we are de-activating */ 1757 ClearPageActive(page); /* we are de-activating */
1758#ifdef CONFIG_CLEANCACHE
1759 SetPageWasActive(page);
1760#endif
1603 list_add(&page->lru, &l_inactive); 1761 list_add(&page->lru, &l_inactive);
1604 } 1762 }
1605 1763
@@ -1747,22 +1905,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1747 u64 fraction[2], denominator; 1905 u64 fraction[2], denominator;
1748 enum lru_list l; 1906 enum lru_list l;
1749 int noswap = 0; 1907 int noswap = 0;
1750 int force_scan = 0; 1908 bool force_scan = false;
1751 1909 unsigned long nr_force_scan[2];
1752 1910
1753 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1911 /* kswapd does zone balancing and needs to scan this zone */
1754 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1912 if (scanning_global_lru(sc) && current_is_kswapd() &&
1755 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + 1913 zone->all_unreclaimable)
1756 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 1914 force_scan = true;
1757 1915 /* memcg may have small limit and need to avoid priority drop */
1758 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { 1916 if (!scanning_global_lru(sc))
1759 /* kswapd does zone balancing and need to scan this zone */ 1917 force_scan = true;
1760 if (scanning_global_lru(sc) && current_is_kswapd())
1761 force_scan = 1;
1762 /* memcg may have small limit and need to avoid priority drop */
1763 if (!scanning_global_lru(sc))
1764 force_scan = 1;
1765 }
1766 1918
1767 /* If we have no swap space, do not bother scanning anon pages. */ 1919 /* If we have no swap space, do not bother scanning anon pages. */
1768 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1920 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1770,9 +1922,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1770 fraction[0] = 0; 1922 fraction[0] = 0;
1771 fraction[1] = 1; 1923 fraction[1] = 1;
1772 denominator = 1; 1924 denominator = 1;
1925 nr_force_scan[0] = 0;
1926 nr_force_scan[1] = SWAP_CLUSTER_MAX;
1773 goto out; 1927 goto out;
1774 } 1928 }
1775 1929
1930 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1931 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1932 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1933 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1934
1776 if (scanning_global_lru(sc)) { 1935 if (scanning_global_lru(sc)) {
1777 free = zone_page_state(zone, NR_FREE_PAGES); 1936 free = zone_page_state(zone, NR_FREE_PAGES);
1778 /* If we have very few page cache pages, 1937 /* If we have very few page cache pages,
@@ -1781,6 +1940,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1781 fraction[0] = 1; 1940 fraction[0] = 1;
1782 fraction[1] = 0; 1941 fraction[1] = 0;
1783 denominator = 1; 1942 denominator = 1;
1943 nr_force_scan[0] = SWAP_CLUSTER_MAX;
1944 nr_force_scan[1] = 0;
1784 goto out; 1945 goto out;
1785 } 1946 }
1786 } 1947 }
@@ -1829,6 +1990,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1829 fraction[0] = ap; 1990 fraction[0] = ap;
1830 fraction[1] = fp; 1991 fraction[1] = fp;
1831 denominator = ap + fp + 1; 1992 denominator = ap + fp + 1;
1993 if (force_scan) {
1994 unsigned long scan = SWAP_CLUSTER_MAX;
1995 nr_force_scan[0] = div64_u64(scan * ap, denominator);
1996 nr_force_scan[1] = div64_u64(scan * fp, denominator);
1997 }
1832out: 1998out:
1833 for_each_evictable_lru(l) { 1999 for_each_evictable_lru(l) {
1834 int file = is_file_lru(l); 2000 int file = is_file_lru(l);
@@ -1849,12 +2015,8 @@ out:
1849 * memcg, priority drop can cause big latency. So, it's better 2015 * memcg, priority drop can cause big latency. So, it's better
1850 * to scan small amount. See may_noscan above. 2016 * to scan small amount. See may_noscan above.
1851 */ 2017 */
1852 if (!scan && force_scan) { 2018 if (!scan && force_scan)
1853 if (file) 2019 scan = nr_force_scan[file];
1854 scan = SWAP_CLUSTER_MAX;
1855 else if (!noswap)
1856 scan = SWAP_CLUSTER_MAX;
1857 }
1858 nr[l] = scan; 2020 nr[l] = scan;
1859 } 2021 }
1860} 2022}
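
[Editor's note] The force_scan path added above precomputes how a forced SWAP_CLUSTER_MAX worth of scanning is split between the anon and file lists, using the same ap/fp weights as the normal fractions. A standalone sketch of that arithmetic, with made-up weights and a plain 64-bit division standing in for the kernel's div64_u64():

#include <stdio.h>
#include <stdint.h>

#define SWAP_CLUSTER_MAX 32	/* mainline value, assumed for this sketch */

/* Userspace stand-in for the kernel's div64_u64() helper. */
static uint64_t div64_u64(uint64_t dividend, uint64_t divisor)
{
	return dividend / divisor;
}

int main(void)
{
	uint64_t ap = 300, fp = 100;		/* hypothetical anon/file weights */
	uint64_t denominator = ap + fp + 1;
	uint64_t scan = SWAP_CLUSTER_MAX;
	uint64_t nr_force_scan[2];

	nr_force_scan[0] = div64_u64(scan * ap, denominator);	/* anon share */
	nr_force_scan[1] = div64_u64(scan * fp, denominator);	/* file share */

	printf("anon %llu, file %llu of %llu forced pages\n",
	       (unsigned long long)nr_force_scan[0],
	       (unsigned long long)nr_force_scan[1],
	       (unsigned long long)scan);
	return 0;
}
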
@@ -1906,8 +2068,9 @@ static inline bool should_continue_reclaim(struct zone *zone,
1906 * inactive lists are large enough, continue reclaiming 2068 * inactive lists are large enough, continue reclaiming
1907 */ 2069 */
1908 pages_for_compaction = (2UL << sc->order); 2070 pages_for_compaction = (2UL << sc->order);
1909 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + 2071 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1910 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 2072 if (nr_swap_pages > 0)
2073 inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1911 if (sc->nr_reclaimed < pages_for_compaction && 2074 if (sc->nr_reclaimed < pages_for_compaction &&
1912 inactive_lru_pages > pages_for_compaction) 2075 inactive_lru_pages > pages_for_compaction)
1913 return true; 2076 return true;
@@ -1979,6 +2142,42 @@ restart:
1979 throttle_vm_writeout(sc->gfp_mask); 2142 throttle_vm_writeout(sc->gfp_mask);
1980} 2143}
1981 2144
2145/* Returns true if compaction should go ahead for a high-order request */
2146static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2147{
2148 unsigned long balance_gap, watermark;
2149 bool watermark_ok;
2150
2151 /* Do not consider compaction for orders reclaim is meant to satisfy */
2152 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
2153 return false;
2154
2155 /*
2156 * Compaction takes time to run and there are potentially other
2157 * callers using the pages just freed. Continue reclaiming until
2158 * there is a buffer of free pages available to give compaction
2159 * a reasonable chance of completing and allocating the page
2160 */
2161 balance_gap = min(low_wmark_pages(zone),
2162 (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2163 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2164 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
2165 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
2166
2167 /*
2168 * If compaction is deferred, reclaim up to a point where
2169 * compaction will have a chance of success when re-enabled
2170 */
2171 if (compaction_deferred(zone))
2172 return watermark_ok;
2173
2174 /* If compaction is not ready to start, keep reclaiming */
2175 if (!compaction_suitable(zone, sc->order))
2176 return false;
2177
2178 return watermark_ok;
2179}
2180
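
[Editor's note] To make the watermark test in compaction_ready() above concrete, here is the same arithmetic with hypothetical zone figures. KSWAPD_ZONE_BALANCE_GAP_RATIO is 100 in mainline; every other number below is invented for the example.

#include <stdio.h>

#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100	/* mainline value */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Hypothetical 1GB zone (256k 4KB pages), order-9 THP allocation. */
	unsigned long present_pages = 262144;
	unsigned long low_wmark = 2048, high_wmark = 3072;
	int order = 9;

	unsigned long balance_gap = min_ul(low_wmark,
			(present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
			KSWAPD_ZONE_BALANCE_GAP_RATIO);
	unsigned long watermark = high_wmark + balance_gap + (2UL << order);

	/* Reclaim keeps running until free pages clear this raised watermark,
	 * giving compaction headroom before it is attempted. */
	printf("balance_gap = %lu pages, compaction watermark = %lu pages\n",
	       balance_gap, watermark);
	return 0;
}
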
1982/* 2181/*
1983 * This is the direct reclaim path, for page-allocating processes. We only 2182 * This is the direct reclaim path, for page-allocating processes. We only
1984 * try to reclaim pages from zones which will satisfy the caller's allocation 2183 * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -1994,14 +2193,20 @@ restart:
1994 * 2193 *
1995 * If a zone is deemed to be full of pinned pages then just give it a light 2194 * If a zone is deemed to be full of pinned pages then just give it a light
1996 * scan then give up on it. 2195 * scan then give up on it.
2196 *
2197 * This function returns true if a zone is being reclaimed for a costly
2198 * high-order allocation and compaction is ready to begin. This indicates to
2199 * the caller that it should consider retrying the allocation instead of
2200 * further reclaim.
1997 */ 2201 */
1998static void shrink_zones(int priority, struct zonelist *zonelist, 2202static bool shrink_zones(int priority, struct zonelist *zonelist,
1999 struct scan_control *sc) 2203 struct scan_control *sc)
2000{ 2204{
2001 struct zoneref *z; 2205 struct zoneref *z;
2002 struct zone *zone; 2206 struct zone *zone;
2003 unsigned long nr_soft_reclaimed; 2207 unsigned long nr_soft_reclaimed;
2004 unsigned long nr_soft_scanned; 2208 unsigned long nr_soft_scanned;
2209 bool aborted_reclaim = false;
2005 2210
2006 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2211 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2007 gfp_zone(sc->gfp_mask), sc->nodemask) { 2212 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2016,6 +2221,21 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2016 continue; 2221 continue;
2017 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2222 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2018 continue; /* Let kswapd poll it */ 2223 continue; /* Let kswapd poll it */
2224 if (COMPACTION_BUILD) {
2225 /*
2226 * If we already have plenty of memory free for
2227 * compaction in this zone, don't free any more.
2228 * Even though compaction is invoked for any
2229 * non-zero order, only frequent costly order
2230 * reclamation is disruptive enough to become a
2231 * noticeable problem, like transparent huge page
2232 * allocations.
2233 */
2234 if (compaction_ready(zone, sc)) {
2235 aborted_reclaim = true;
2236 continue;
2237 }
2238 }
2019 /* 2239 /*
2020 * This steals pages from memory cgroups over softlimit 2240 * This steals pages from memory cgroups over softlimit
2021 * and returns the number of reclaimed pages and 2241 * and returns the number of reclaimed pages and
@@ -2033,6 +2253,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2033 2253
2034 shrink_zone(priority, zone, sc); 2254 shrink_zone(priority, zone, sc);
2035 } 2255 }
2256
2257 return aborted_reclaim;
2036} 2258}
2037 2259
2038static bool zone_reclaimable(struct zone *zone) 2260static bool zone_reclaimable(struct zone *zone)
@@ -2086,8 +2308,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2086 struct zoneref *z; 2308 struct zoneref *z;
2087 struct zone *zone; 2309 struct zone *zone;
2088 unsigned long writeback_threshold; 2310 unsigned long writeback_threshold;
2311 bool aborted_reclaim;
2089 2312
2090 get_mems_allowed();
2091 delayacct_freepages_start(); 2313 delayacct_freepages_start();
2092 2314
2093 if (scanning_global_lru(sc)) 2315 if (scanning_global_lru(sc))
@@ -2097,7 +2319,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2097 sc->nr_scanned = 0; 2319 sc->nr_scanned = 0;
2098 if (!priority) 2320 if (!priority)
2099 disable_swap_token(sc->mem_cgroup); 2321 disable_swap_token(sc->mem_cgroup);
2100 shrink_zones(priority, zonelist, sc); 2322 aborted_reclaim = shrink_zones(priority, zonelist, sc);
2323
2101 /* 2324 /*
2102 * Don't shrink slabs when reclaiming memory from 2325 * Don't shrink slabs when reclaiming memory from
2103 * over limit cgroups 2326 * over limit cgroups
@@ -2131,7 +2354,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2131 */ 2354 */
2132 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; 2355 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
2133 if (total_scanned > writeback_threshold) { 2356 if (total_scanned > writeback_threshold) {
2134 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); 2357 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
2358 WB_REASON_TRY_TO_FREE_PAGES);
2135 sc->may_writepage = 1; 2359 sc->may_writepage = 1;
2136 } 2360 }
2137 2361
@@ -2149,7 +2373,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2149 2373
2150out: 2374out:
2151 delayacct_freepages_end(); 2375 delayacct_freepages_end();
2152 put_mems_allowed();
2153 2376
2154 if (sc->nr_reclaimed) 2377 if (sc->nr_reclaimed)
2155 return sc->nr_reclaimed; 2378 return sc->nr_reclaimed;
@@ -2162,6 +2385,10 @@ out:
2162 if (oom_killer_disabled) 2385 if (oom_killer_disabled)
2163 return 0; 2386 return 0;
2164 2387
2388 /* Aborted reclaim to try compaction? don't OOM, then */
2389 if (aborted_reclaim)
2390 return 1;
2391
2165 /* top priority shrink_zones still had more to do? don't OOM, then */ 2392 /* top priority shrink_zones still had more to do? don't OOM, then */
2166 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) 2393 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
2167 return 1; 2394 return 1;
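
[Editor's note] The effect of the new aborted_reclaim flag on the return value can be summarised in a small decision sketch (simplified names, not the kernel's exact code): reporting nominal progress when reclaim was aborted for compaction lets the page allocator retry with compaction instead of falling through to the OOM killer.

#include <stdbool.h>
#include <stdio.h>

/* Simplified model of the return decisions at the end of
 * do_try_to_free_pages(); purely illustrative. */
static unsigned long reclaim_result(unsigned long nr_reclaimed,
				    bool aborted_reclaim,
				    bool more_to_do)
{
	if (nr_reclaimed)
		return nr_reclaimed;	/* made real progress */
	if (aborted_reclaim)
		return 1;		/* stopped early for compaction: no OOM */
	if (more_to_do)
		return 1;		/* top-priority pass not exhausted: no OOM */
	return 0;			/* genuinely stuck, OOM path may follow */
}

int main(void)
{
	printf("aborted for compaction -> %lu\n", reclaim_result(0, true, false));
	printf("truly stuck            -> %lu\n", reclaim_result(0, false, false));
	return 0;
}
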
@@ -2453,6 +2680,9 @@ loop_again:
2453 high_wmark_pages(zone), 0, 0)) { 2680 high_wmark_pages(zone), 0, 0)) {
2454 end_zone = i; 2681 end_zone = i;
2455 break; 2682 break;
2683 } else {
2684 /* If balanced, clear the congested flag */
2685 zone_clear_flag(zone, ZONE_CONGESTED);
2456 } 2686 }
2457 } 2687 }
2458 if (i < 0) 2688 if (i < 0)
@@ -2689,7 +2919,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2689 * them before going back to sleep. 2919 * them before going back to sleep.
2690 */ 2920 */
2691 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2921 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2692 schedule(); 2922
2923 if (!kthread_should_stop())
2924 schedule();
2925
2693 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 2926 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2694 } else { 2927 } else {
2695 if (remaining) 2928 if (remaining)
@@ -2716,7 +2949,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2716static int kswapd(void *p) 2949static int kswapd(void *p)
2717{ 2950{
2718 unsigned long order, new_order; 2951 unsigned long order, new_order;
2952 unsigned balanced_order;
2719 int classzone_idx, new_classzone_idx; 2953 int classzone_idx, new_classzone_idx;
2954 int balanced_classzone_idx;
2720 pg_data_t *pgdat = (pg_data_t*)p; 2955 pg_data_t *pgdat = (pg_data_t*)p;
2721 struct task_struct *tsk = current; 2956 struct task_struct *tsk = current;
2722 2957
@@ -2747,7 +2982,9 @@ static int kswapd(void *p)
2747 set_freezable(); 2982 set_freezable();
2748 2983
2749 order = new_order = 0; 2984 order = new_order = 0;
2985 balanced_order = 0;
2750 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2986 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2987 balanced_classzone_idx = classzone_idx;
2751 for ( ; ; ) { 2988 for ( ; ; ) {
2752 int ret; 2989 int ret;
2753 2990
@@ -2756,7 +2993,8 @@ static int kswapd(void *p)
2756 * new request of a similar or harder type will succeed soon 2993 * new request of a similar or harder type will succeed soon
2757 * so consider going to sleep on the basis we reclaimed at 2994 * so consider going to sleep on the basis we reclaimed at
2758 */ 2995 */
2759 if (classzone_idx >= new_classzone_idx && order == new_order) { 2996 if (balanced_classzone_idx >= new_classzone_idx &&
2997 balanced_order == new_order) {
2760 new_order = pgdat->kswapd_max_order; 2998 new_order = pgdat->kswapd_max_order;
2761 new_classzone_idx = pgdat->classzone_idx; 2999 new_classzone_idx = pgdat->classzone_idx;
2762 pgdat->kswapd_max_order = 0; 3000 pgdat->kswapd_max_order = 0;
@@ -2771,9 +3009,12 @@ static int kswapd(void *p)
2771 order = new_order; 3009 order = new_order;
2772 classzone_idx = new_classzone_idx; 3010 classzone_idx = new_classzone_idx;
2773 } else { 3011 } else {
2774 kswapd_try_to_sleep(pgdat, order, classzone_idx); 3012 kswapd_try_to_sleep(pgdat, balanced_order,
3013 balanced_classzone_idx);
2775 order = pgdat->kswapd_max_order; 3014 order = pgdat->kswapd_max_order;
2776 classzone_idx = pgdat->classzone_idx; 3015 classzone_idx = pgdat->classzone_idx;
3016 new_order = order;
3017 new_classzone_idx = classzone_idx;
2777 pgdat->kswapd_max_order = 0; 3018 pgdat->kswapd_max_order = 0;
2778 pgdat->classzone_idx = pgdat->nr_zones - 1; 3019 pgdat->classzone_idx = pgdat->nr_zones - 1;
2779 } 3020 }
@@ -2788,7 +3029,9 @@ static int kswapd(void *p)
2788 */ 3029 */
2789 if (!ret) { 3030 if (!ret) {
2790 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 3031 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2791 order = balance_pgdat(pgdat, order, &classzone_idx); 3032 balanced_classzone_idx = classzone_idx;
3033 balanced_order = balance_pgdat(pgdat, order,
3034 &balanced_classzone_idx);
2792 } 3035 }
2793 } 3036 }
2794 return 0; 3037 return 0;
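
[Editor's note] The kswapd loop changes above boil down to remembering what balance_pgdat() actually achieved (balanced_order, balanced_classzone_idx) and comparing that, rather than the original request, against any newly arrived request. A tiny sketch of that comparison with illustrative numbers:

#include <stdio.h>

int main(void)
{
	/* Hypothetical state: kswapd was asked for order-3 but only managed
	 * to balance the node for order-0. */
	unsigned long new_order = 3, balanced_order = 0;
	int new_classzone_idx = 1, balanced_classzone_idx = 2;

	if (balanced_classzone_idx >= new_classzone_idx &&
	    balanced_order == new_order)
		printf("previous request satisfied: pick up the next one\n");
	else
		printf("previous request unmet: keep reclaiming before sleeping\n");
	return 0;
}
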
@@ -2946,14 +3189,17 @@ int kswapd_run(int nid)
2946} 3189}
2947 3190
2948/* 3191/*
2949 * Called by memory hotplug when all memory in a node is offlined. 3192 * Called by memory hotplug when all memory in a node is offlined. Caller must
3193 * hold lock_memory_hotplug().
2950 */ 3194 */
2951void kswapd_stop(int nid) 3195void kswapd_stop(int nid)
2952{ 3196{
2953 struct task_struct *kswapd = NODE_DATA(nid)->kswapd; 3197 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
2954 3198
2955 if (kswapd) 3199 if (kswapd) {
2956 kthread_stop(kswapd); 3200 kthread_stop(kswapd);
3201 NODE_DATA(nid)->kswapd = NULL;
3202 }
2957} 3203}
2958 3204
2959static int __init kswapd_init(void) 3205static int __init kswapd_init(void)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c18b7694b..9c001a268ab 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu)
78 * 78 *
79 * vm_stat contains the global counters 79 * vm_stat contains the global counters
80 */ 80 */
81atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; 81atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
82EXPORT_SYMBOL(vm_stat); 82EXPORT_SYMBOL(vm_stat);
83 83
84#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
@@ -702,6 +702,7 @@ const char * const vmstat_text[] = {
702 "nr_unstable", 702 "nr_unstable",
703 "nr_bounce", 703 "nr_bounce",
704 "nr_vmscan_write", 704 "nr_vmscan_write",
705 "nr_vmscan_immediate_reclaim",
705 "nr_writeback_temp", 706 "nr_writeback_temp",
706 "nr_isolated_anon", 707 "nr_isolated_anon",
707 "nr_isolated_file", 708 "nr_isolated_file",