Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |    2
-rw-r--r--  mm/Makefile           |    1
-rw-r--r--  mm/ashmem.c           |  748
-rw-r--r--  mm/backing-dev.c      |  123
-rw-r--r--  mm/dmapool.c          |    2
-rw-r--r--  mm/failslab.c         |   39
-rw-r--r--  mm/filemap.c          |  160
-rw-r--r--  mm/highmem.c          |    4
-rw-r--r--  mm/huge_memory.c      |   43
-rw-r--r--  mm/hugetlb.c          |   46
-rw-r--r--  mm/init-mm.c          |    2
-rw-r--r--  mm/internal.h         |   46
-rw-r--r--  mm/kmemleak.c         |    2
-rw-r--r--  mm/madvise.c          |    2
-rw-r--r--  mm/memblock.c         |    8
-rw-r--r--  mm/memcontrol.c       |  500
-rw-r--r--  mm/memory-failure.c   |   92
-rw-r--r--  mm/memory.c           |  127
-rw-r--r--  mm/memory_hotplug.c   |   68
-rw-r--r--  mm/mempolicy.c        |   32
-rw-r--r--  mm/migrate.c          |    8
-rw-r--r--  mm/mincore.c          |   11
-rw-r--r--  mm/mmap.c             |   34
-rw-r--r--  mm/nommu.c            |   37
-rw-r--r--  mm/oom_kill.c         |   11
-rw-r--r--  mm/page-writeback.c   |  280
-rw-r--r--  mm/page_alloc.c       |  207
-rw-r--r--  mm/page_cgroup.c      |   10
-rw-r--r--  mm/pagewalk.c         |   49
-rw-r--r--  mm/percpu-vm.c        |   12
-rw-r--r--  mm/percpu.c           |   40
-rw-r--r--  mm/rmap.c             |   11
-rw-r--r--  mm/shmem.c            | 1828
-rw-r--r--  mm/slab.c             |  121
-rw-r--r--  mm/slob.c             |    8
-rw-r--r--  mm/slub.c             |  882
-rw-r--r--  mm/sparse.c           |    2
-rw-r--r--  mm/swap.c             |   83
-rw-r--r--  mm/swapfile.c         |   49
-rw-r--r--  mm/thrash.c           |   17
-rw-r--r--  mm/truncate.c         |  154
-rw-r--r--  mm/vmalloc.c          |  102
-rw-r--r--  mm/vmscan.c           |  145
-rw-r--r--  mm/vmstat.c           |    4
44 files changed, 3794 insertions(+), 2358 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 8ca47a5ee9c..f2f1ca19ed5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -356,7 +356,7 @@ config CLEANCACHE
   for clean pages that the kernel's pageframe replacement algorithm
   (PFRA) would like to keep around, but can't since there isn't enough
   memory. So when the PFRA "evicts" a page, it first attempts to use
-  cleancacne code to put the data contained in that page into
+  cleancache code to put the data contained in that page into
   "transcendent memory", memory that is not directly accessible or
   addressable by the kernel and is of unknown and possibly
   time-varying size. And when a cleancache-enabled
diff --git a/mm/Makefile b/mm/Makefile
index 836e4163c1b..2d00bf57ca4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
 obj-$(CONFIG_NUMA) += mempolicy.o
 obj-$(CONFIG_SPARSEMEM) += sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
+obj-$(CONFIG_ASHMEM) += ashmem.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_COMPACTION) += compaction.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
diff --git a/mm/ashmem.c b/mm/ashmem.c
new file mode 100644
index 00000000000..66e3f23ee33
--- /dev/null
+++ b/mm/ashmem.c
@@ -0,0 +1,748 @@
1/* mm/ashmem.c
2**
3** Anonymous Shared Memory Subsystem, ashmem
4**
5** Copyright (C) 2008 Google, Inc.
6**
7** Robert Love <rlove@google.com>
8**
9** This software is licensed under the terms of the GNU General Public
10** License version 2, as published by the Free Software Foundation, and
11** may be copied, distributed, and modified under those terms.
12**
13** This program is distributed in the hope that it will be useful,
14** but WITHOUT ANY WARRANTY; without even the implied warranty of
15** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16** GNU General Public License for more details.
17*/
18
19#include <linux/module.h>
20#include <linux/file.h>
21#include <linux/fs.h>
22#include <linux/miscdevice.h>
23#include <linux/security.h>
24#include <linux/mm.h>
25#include <linux/mman.h>
26#include <linux/uaccess.h>
27#include <linux/personality.h>
28#include <linux/bitops.h>
29#include <linux/mutex.h>
30#include <linux/shmem_fs.h>
31#include <linux/ashmem.h>
32
33#define ASHMEM_NAME_PREFIX "dev/ashmem/"
34#define ASHMEM_NAME_PREFIX_LEN (sizeof(ASHMEM_NAME_PREFIX) - 1)
35#define ASHMEM_FULL_NAME_LEN (ASHMEM_NAME_LEN + ASHMEM_NAME_PREFIX_LEN)
36
37/*
38 * ashmem_area - anonymous shared memory area
39 * Lifecycle: From our parent file's open() until its release()
40 * Locking: Protected by `ashmem_mutex'
41 * Big Note: Mappings do NOT pin this structure; it dies on close()
42 */
43struct ashmem_area {
44 char name[ASHMEM_FULL_NAME_LEN];/* optional name for /proc/pid/maps */
45 struct list_head unpinned_list; /* list of all ashmem areas */
46 struct file *file; /* the shmem-based backing file */
47 size_t size; /* size of the mapping, in bytes */
48 unsigned long prot_mask; /* allowed prot bits, as vm_flags */
49};
50
51/*
52 * ashmem_range - represents an interval of unpinned (evictable) pages
53 * Lifecycle: From unpin to pin
54 * Locking: Protected by `ashmem_mutex'
55 */
56struct ashmem_range {
57 struct list_head lru; /* entry in LRU list */
58 struct list_head unpinned; /* entry in its area's unpinned list */
59 struct ashmem_area *asma; /* associated area */
60 size_t pgstart; /* starting page, inclusive */
61 size_t pgend; /* ending page, inclusive */
62 unsigned int purged; /* ASHMEM_NOT or ASHMEM_WAS_PURGED */
63};
64
65/* LRU list of unpinned pages, protected by ashmem_mutex */
66static LIST_HEAD(ashmem_lru_list);
67
68/* Count of pages on our LRU list, protected by ashmem_mutex */
69static unsigned long lru_count;
70
71/*
72 * ashmem_mutex - protects the list of and each individual ashmem_area
73 *
 74 * Lock Ordering: ashmem_mutex -> i_mutex -> i_alloc_sem
75 */
76static DEFINE_MUTEX(ashmem_mutex);
77
78static struct kmem_cache *ashmem_area_cachep __read_mostly;
79static struct kmem_cache *ashmem_range_cachep __read_mostly;
80
81#define range_size(range) \
82 ((range)->pgend - (range)->pgstart + 1)
83
84#define range_on_lru(range) \
85 ((range)->purged == ASHMEM_NOT_PURGED)
86
87#define page_range_subsumes_range(range, start, end) \
88 (((range)->pgstart >= (start)) && ((range)->pgend <= (end)))
89
90#define page_range_subsumed_by_range(range, start, end) \
91 (((range)->pgstart <= (start)) && ((range)->pgend >= (end)))
92
93#define page_in_range(range, page) \
94 (((range)->pgstart <= (page)) && ((range)->pgend >= (page)))
95
96#define page_range_in_range(range, start, end) \
97 (page_in_range(range, start) || page_in_range(range, end) || \
98 page_range_subsumes_range(range, start, end))
99
100#define range_before_page(range, page) \
101 ((range)->pgend < (page))
102
103#define PROT_MASK (PROT_EXEC | PROT_READ | PROT_WRITE)
104
105static inline void lru_add(struct ashmem_range *range)
106{
107 list_add_tail(&range->lru, &ashmem_lru_list);
108 lru_count += range_size(range);
109}
110
111static inline void lru_del(struct ashmem_range *range)
112{
113 list_del(&range->lru);
114 lru_count -= range_size(range);
115}
116
117/*
118 * range_alloc - allocate and initialize a new ashmem_range structure
119 *
120 * 'asma' - associated ashmem_area
121 * 'prev_range' - the previous ashmem_range in the sorted asma->unpinned list
 122 * 'purged' - initial purge value (ASHMEM_NOT_PURGED or ASHMEM_WAS_PURGED)
123 * 'start' - starting page, inclusive
124 * 'end' - ending page, inclusive
125 *
126 * Caller must hold ashmem_mutex.
127 */
128static int range_alloc(struct ashmem_area *asma,
129 struct ashmem_range *prev_range, unsigned int purged,
130 size_t start, size_t end)
131{
132 struct ashmem_range *range;
133
134 range = kmem_cache_zalloc(ashmem_range_cachep, GFP_KERNEL);
135 if (unlikely(!range))
136 return -ENOMEM;
137
138 range->asma = asma;
139 range->pgstart = start;
140 range->pgend = end;
141 range->purged = purged;
142
143 list_add_tail(&range->unpinned, &prev_range->unpinned);
144
145 if (range_on_lru(range))
146 lru_add(range);
147
148 return 0;
149}
150
151static void range_del(struct ashmem_range *range)
152{
153 list_del(&range->unpinned);
154 if (range_on_lru(range))
155 lru_del(range);
156 kmem_cache_free(ashmem_range_cachep, range);
157}
158
159/*
160 * range_shrink - shrinks a range
161 *
162 * Caller must hold ashmem_mutex.
163 */
164static inline void range_shrink(struct ashmem_range *range,
165 size_t start, size_t end)
166{
167 size_t pre = range_size(range);
168
169 range->pgstart = start;
170 range->pgend = end;
171
172 if (range_on_lru(range))
173 lru_count -= pre - range_size(range);
174}
175
176static int ashmem_open(struct inode *inode, struct file *file)
177{
178 struct ashmem_area *asma;
179 int ret;
180
181 ret = generic_file_open(inode, file);
182 if (unlikely(ret))
183 return ret;
184
185 asma = kmem_cache_zalloc(ashmem_area_cachep, GFP_KERNEL);
186 if (unlikely(!asma))
187 return -ENOMEM;
188
189 INIT_LIST_HEAD(&asma->unpinned_list);
190 memcpy(asma->name, ASHMEM_NAME_PREFIX, ASHMEM_NAME_PREFIX_LEN);
191 asma->prot_mask = PROT_MASK;
192 file->private_data = asma;
193
194 return 0;
195}
196
197static int ashmem_release(struct inode *ignored, struct file *file)
198{
199 struct ashmem_area *asma = file->private_data;
200 struct ashmem_range *range, *next;
201
202 mutex_lock(&ashmem_mutex);
203 list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned)
204 range_del(range);
205 mutex_unlock(&ashmem_mutex);
206
207 if (asma->file)
208 fput(asma->file);
209 kmem_cache_free(ashmem_area_cachep, asma);
210
211 return 0;
212}
213
214static ssize_t ashmem_read(struct file *file, char __user *buf,
215 size_t len, loff_t *pos)
216{
217 struct ashmem_area *asma = file->private_data;
218 int ret = 0;
219
220 mutex_lock(&ashmem_mutex);
221
222 /* If size is not set, or set to 0, always return EOF. */
223 if (asma->size == 0) {
224 goto out;
225 }
226
227 if (!asma->file) {
228 ret = -EBADF;
229 goto out;
230 }
231
232 ret = asma->file->f_op->read(asma->file, buf, len, pos);
233 if (ret < 0) {
234 goto out;
235 }
236
237 /** Update backing file pos, since f_ops->read() doesn't */
238 asma->file->f_pos = *pos;
239
240out:
241 mutex_unlock(&ashmem_mutex);
242 return ret;
243}
244
245static loff_t ashmem_llseek(struct file *file, loff_t offset, int origin)
246{
247 struct ashmem_area *asma = file->private_data;
248 int ret;
249
250 mutex_lock(&ashmem_mutex);
251
252 if (asma->size == 0) {
253 ret = -EINVAL;
254 goto out;
255 }
256
257 if (!asma->file) {
258 ret = -EBADF;
259 goto out;
260 }
261
262 ret = asma->file->f_op->llseek(asma->file, offset, origin);
263 if (ret < 0) {
264 goto out;
265 }
266
267 /** Copy f_pos from backing file, since f_ops->llseek() sets it */
268 file->f_pos = asma->file->f_pos;
269
270out:
271 mutex_unlock(&ashmem_mutex);
272 return ret;
273}
274
275static inline unsigned long
276calc_vm_may_flags(unsigned long prot)
277{
278 return _calc_vm_trans(prot, PROT_READ, VM_MAYREAD ) |
279 _calc_vm_trans(prot, PROT_WRITE, VM_MAYWRITE) |
280 _calc_vm_trans(prot, PROT_EXEC, VM_MAYEXEC);
281}
282
283static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
284{
285 struct ashmem_area *asma = file->private_data;
286 int ret = 0;
287
288 mutex_lock(&ashmem_mutex);
289
290 /* user needs to SET_SIZE before mapping */
291 if (unlikely(!asma->size)) {
292 ret = -EINVAL;
293 goto out;
294 }
295
296 /* requested protection bits must match our allowed protection mask */
297 if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) &
298 calc_vm_prot_bits(PROT_MASK))) {
299 ret = -EPERM;
300 goto out;
301 }
302 vma->vm_flags &= ~calc_vm_may_flags(~asma->prot_mask);
303
304 if (!asma->file) {
305 char *name = ASHMEM_NAME_DEF;
306 struct file *vmfile;
307
308 if (asma->name[ASHMEM_NAME_PREFIX_LEN] != '\0')
309 name = asma->name;
310
311 /* ... and allocate the backing shmem file */
312 vmfile = shmem_file_setup(name, asma->size, vma->vm_flags);
313 if (unlikely(IS_ERR(vmfile))) {
314 ret = PTR_ERR(vmfile);
315 goto out;
316 }
317 asma->file = vmfile;
318 }
319 get_file(asma->file);
320
321 if (vma->vm_flags & VM_SHARED)
322 shmem_set_file(vma, asma->file);
323 else {
324 if (vma->vm_file)
325 fput(vma->vm_file);
326 vma->vm_file = asma->file;
327 }
328 vma->vm_flags |= VM_CAN_NONLINEAR;
329
330out:
331 mutex_unlock(&ashmem_mutex);
332 return ret;
333}
334
335/*
336 * ashmem_shrink - our cache shrinker, called from mm/vmscan.c :: shrink_slab
337 *
338 * 'nr_to_scan' is the number of objects (pages) to prune, or 0 to query how
339 * many objects (pages) we have in total.
340 *
341 * 'gfp_mask' is the mask of the allocation that got us into this mess.
342 *
343 * Return value is the number of objects (pages) remaining, or -1 if we cannot
344 * proceed without risk of deadlock (due to gfp_mask).
345 *
346 * We approximate LRU via least-recently-unpinned, jettisoning unpinned partial
347 * chunks of ashmem regions LRU-wise one-at-a-time until we hit 'nr_to_scan'
348 * pages freed.
349 */
350static int ashmem_shrink(struct shrinker *s, struct shrink_control *sc)
351{
352 struct ashmem_range *range, *next;
353
354 /* We might recurse into filesystem code, so bail out if necessary */
355 if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
356 return -1;
357 if (!sc->nr_to_scan)
358 return lru_count;
359
360 mutex_lock(&ashmem_mutex);
361 list_for_each_entry_safe(range, next, &ashmem_lru_list, lru) {
362 struct inode *inode = range->asma->file->f_dentry->d_inode;
363 loff_t start = range->pgstart * PAGE_SIZE;
364 loff_t end = (range->pgend + 1) * PAGE_SIZE - 1;
365
366 vmtruncate_range(inode, start, end);
367 range->purged = ASHMEM_WAS_PURGED;
368 lru_del(range);
369
370 sc->nr_to_scan -= range_size(range);
371 if (sc->nr_to_scan <= 0)
372 break;
373 }
374 mutex_unlock(&ashmem_mutex);
375
376 return lru_count;
377}
378
379static struct shrinker ashmem_shrinker = {
380 .shrink = ashmem_shrink,
381 .seeks = DEFAULT_SEEKS * 4,
382};
383
384static int set_prot_mask(struct ashmem_area *asma, unsigned long prot)
385{
386 int ret = 0;
387
388 mutex_lock(&ashmem_mutex);
389
390 /* the user can only remove, not add, protection bits */
391 if (unlikely((asma->prot_mask & prot) != prot)) {
392 ret = -EINVAL;
393 goto out;
394 }
395
396 /* does the application expect PROT_READ to imply PROT_EXEC? */
397 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
398 prot |= PROT_EXEC;
399
400 asma->prot_mask = prot;
401
402out:
403 mutex_unlock(&ashmem_mutex);
404 return ret;
405}
406
407static int set_name(struct ashmem_area *asma, void __user *name)
408{
409 int ret = 0;
410
411 mutex_lock(&ashmem_mutex);
412
413 /* cannot change an existing mapping's name */
414 if (unlikely(asma->file)) {
415 ret = -EINVAL;
416 goto out;
417 }
418
419 if (unlikely(copy_from_user(asma->name + ASHMEM_NAME_PREFIX_LEN,
420 name, ASHMEM_NAME_LEN)))
421 ret = -EFAULT;
422 asma->name[ASHMEM_FULL_NAME_LEN-1] = '\0';
423
424out:
425 mutex_unlock(&ashmem_mutex);
426
427 return ret;
428}
429
430static int get_name(struct ashmem_area *asma, void __user *name)
431{
432 int ret = 0;
433
434 mutex_lock(&ashmem_mutex);
435 if (asma->name[ASHMEM_NAME_PREFIX_LEN] != '\0') {
436 size_t len;
437
438 /*
439 * Copying only `len', instead of ASHMEM_NAME_LEN, bytes
440 * prevents us from revealing one user's stack to another.
441 */
442 len = strlen(asma->name + ASHMEM_NAME_PREFIX_LEN) + 1;
443 if (unlikely(copy_to_user(name,
444 asma->name + ASHMEM_NAME_PREFIX_LEN, len)))
445 ret = -EFAULT;
446 } else {
447 if (unlikely(copy_to_user(name, ASHMEM_NAME_DEF,
448 sizeof(ASHMEM_NAME_DEF))))
449 ret = -EFAULT;
450 }
451 mutex_unlock(&ashmem_mutex);
452
453 return ret;
454}
455
456/*
457 * ashmem_pin - pin the given ashmem region, returning whether it was
458 * previously purged (ASHMEM_WAS_PURGED) or not (ASHMEM_NOT_PURGED).
459 *
460 * Caller must hold ashmem_mutex.
461 */
462static int ashmem_pin(struct ashmem_area *asma, size_t pgstart, size_t pgend)
463{
464 struct ashmem_range *range, *next;
465 int ret = ASHMEM_NOT_PURGED;
466
467 list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned) {
468 /* moved past last applicable page; we can short circuit */
469 if (range_before_page(range, pgstart))
470 break;
471
472 /*
473 * The user can ask us to pin pages that span multiple ranges,
474 * or to pin pages that aren't even unpinned, so this is messy.
475 *
476 * Four cases:
477 * 1. The requested range subsumes an existing range, so we
478 * just remove the entire matching range.
479 * 2. The requested range overlaps the start of an existing
480 * range, so we just update that range.
481 * 3. The requested range overlaps the end of an existing
482 * range, so we just update that range.
483 * 4. The requested range punches a hole in an existing range,
484 * so we have to update one side of the range and then
485 * create a new range for the other side.
486 */
487 if (page_range_in_range(range, pgstart, pgend)) {
488 ret |= range->purged;
489
490 /* Case #1: Easy. Just nuke the whole thing. */
491 if (page_range_subsumes_range(range, pgstart, pgend)) {
492 range_del(range);
493 continue;
494 }
495
496 /* Case #2: We overlap from the start, so adjust it */
497 if (range->pgstart >= pgstart) {
498 range_shrink(range, pgend + 1, range->pgend);
499 continue;
500 }
501
502 /* Case #3: We overlap from the rear, so adjust it */
503 if (range->pgend <= pgend) {
504 range_shrink(range, range->pgstart, pgstart-1);
505 continue;
506 }
507
508 /*
509 * Case #4: We eat a chunk out of the middle. A bit
510 * more complicated, we allocate a new range for the
511 * second half and adjust the first chunk's endpoint.
512 */
513 range_alloc(asma, range, range->purged,
514 pgend + 1, range->pgend);
515 range_shrink(range, range->pgstart, pgstart - 1);
516 break;
517 }
518 }
519
520 return ret;
521}
522
523/*
524 * ashmem_unpin - unpin the given range of pages. Returns zero on success.
525 *
526 * Caller must hold ashmem_mutex.
527 */
528static int ashmem_unpin(struct ashmem_area *asma, size_t pgstart, size_t pgend)
529{
530 struct ashmem_range *range, *next;
531 unsigned int purged = ASHMEM_NOT_PURGED;
532
533restart:
534 list_for_each_entry_safe(range, next, &asma->unpinned_list, unpinned) {
535 /* short circuit: this is our insertion point */
536 if (range_before_page(range, pgstart))
537 break;
538
539 /*
540 * The user can ask us to unpin pages that are already entirely
541 * or partially pinned. We handle those two cases here.
542 */
543 if (page_range_subsumed_by_range(range, pgstart, pgend))
544 return 0;
545 if (page_range_in_range(range, pgstart, pgend)) {
546 pgstart = min_t(size_t, range->pgstart, pgstart),
547 pgend = max_t(size_t, range->pgend, pgend);
548 purged |= range->purged;
549 range_del(range);
550 goto restart;
551 }
552 }
553
554 return range_alloc(asma, range, purged, pgstart, pgend);
555}
556
557/*
558 * ashmem_get_pin_status - Returns ASHMEM_IS_UNPINNED if _any_ pages in the
559 * given interval are unpinned and ASHMEM_IS_PINNED otherwise.
560 *
561 * Caller must hold ashmem_mutex.
562 */
563static int ashmem_get_pin_status(struct ashmem_area *asma, size_t pgstart,
564 size_t pgend)
565{
566 struct ashmem_range *range;
567 int ret = ASHMEM_IS_PINNED;
568
569 list_for_each_entry(range, &asma->unpinned_list, unpinned) {
570 if (range_before_page(range, pgstart))
571 break;
572 if (page_range_in_range(range, pgstart, pgend)) {
573 ret = ASHMEM_IS_UNPINNED;
574 break;
575 }
576 }
577
578 return ret;
579}
580
581static int ashmem_pin_unpin(struct ashmem_area *asma, unsigned long cmd,
582 void __user *p)
583{
584 struct ashmem_pin pin;
585 size_t pgstart, pgend;
586 int ret = -EINVAL;
587
588 if (unlikely(!asma->file))
589 return -EINVAL;
590
591 if (unlikely(copy_from_user(&pin, p, sizeof(pin))))
592 return -EFAULT;
593
594 /* per custom, you can pass zero for len to mean "everything onward" */
595 if (!pin.len)
596 pin.len = PAGE_ALIGN(asma->size) - pin.offset;
597
598 if (unlikely((pin.offset | pin.len) & ~PAGE_MASK))
599 return -EINVAL;
600
601 if (unlikely(((__u32) -1) - pin.offset < pin.len))
602 return -EINVAL;
603
604 if (unlikely(PAGE_ALIGN(asma->size) < pin.offset + pin.len))
605 return -EINVAL;
606
607 pgstart = pin.offset / PAGE_SIZE;
608 pgend = pgstart + (pin.len / PAGE_SIZE) - 1;
609
610 mutex_lock(&ashmem_mutex);
611
612 switch (cmd) {
613 case ASHMEM_PIN:
614 ret = ashmem_pin(asma, pgstart, pgend);
615 break;
616 case ASHMEM_UNPIN:
617 ret = ashmem_unpin(asma, pgstart, pgend);
618 break;
619 case ASHMEM_GET_PIN_STATUS:
620 ret = ashmem_get_pin_status(asma, pgstart, pgend);
621 break;
622 }
623
624 mutex_unlock(&ashmem_mutex);
625
626 return ret;
627}
628
629static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
630{
631 struct ashmem_area *asma = file->private_data;
632 long ret = -ENOTTY;
633
634 switch (cmd) {
635 case ASHMEM_SET_NAME:
636 ret = set_name(asma, (void __user *) arg);
637 break;
638 case ASHMEM_GET_NAME:
639 ret = get_name(asma, (void __user *) arg);
640 break;
641 case ASHMEM_SET_SIZE:
642 ret = -EINVAL;
643 if (!asma->file) {
644 ret = 0;
645 asma->size = (size_t) arg;
646 }
647 break;
648 case ASHMEM_GET_SIZE:
649 ret = asma->size;
650 break;
651 case ASHMEM_SET_PROT_MASK:
652 ret = set_prot_mask(asma, arg);
653 break;
654 case ASHMEM_GET_PROT_MASK:
655 ret = asma->prot_mask;
656 break;
657 case ASHMEM_PIN:
658 case ASHMEM_UNPIN:
659 case ASHMEM_GET_PIN_STATUS:
660 ret = ashmem_pin_unpin(asma, cmd, (void __user *) arg);
661 break;
662 case ASHMEM_PURGE_ALL_CACHES:
663 ret = -EPERM;
664 if (capable(CAP_SYS_ADMIN)) {
665 struct shrink_control sc = {
666 .gfp_mask = GFP_KERNEL,
667 .nr_to_scan = 0,
668 };
669 ret = ashmem_shrink(&ashmem_shrinker, &sc);
670 sc.nr_to_scan = ret;
671 ashmem_shrink(&ashmem_shrinker, &sc);
672 }
673 break;
674 }
675
676 return ret;
677}
678
679static struct file_operations ashmem_fops = {
680 .owner = THIS_MODULE,
681 .open = ashmem_open,
682 .release = ashmem_release,
683 .read = ashmem_read,
684 .llseek = ashmem_llseek,
685 .mmap = ashmem_mmap,
686 .unlocked_ioctl = ashmem_ioctl,
687 .compat_ioctl = ashmem_ioctl,
688};
689
690static struct miscdevice ashmem_misc = {
691 .minor = MISC_DYNAMIC_MINOR,
692 .name = "ashmem",
693 .fops = &ashmem_fops,
694};
695
696static int __init ashmem_init(void)
697{
698 int ret;
699
700 ashmem_area_cachep = kmem_cache_create("ashmem_area_cache",
701 sizeof(struct ashmem_area),
702 0, 0, NULL);
703 if (unlikely(!ashmem_area_cachep)) {
704 printk(KERN_ERR "ashmem: failed to create slab cache\n");
705 return -ENOMEM;
706 }
707
708 ashmem_range_cachep = kmem_cache_create("ashmem_range_cache",
709 sizeof(struct ashmem_range),
710 0, 0, NULL);
711 if (unlikely(!ashmem_range_cachep)) {
712 printk(KERN_ERR "ashmem: failed to create slab cache\n");
713 return -ENOMEM;
714 }
715
716 ret = misc_register(&ashmem_misc);
717 if (unlikely(ret)) {
718 printk(KERN_ERR "ashmem: failed to register misc device!\n");
719 return ret;
720 }
721
722 register_shrinker(&ashmem_shrinker);
723
724 printk(KERN_INFO "ashmem: initialized\n");
725
726 return 0;
727}
728
729static void __exit ashmem_exit(void)
730{
731 int ret;
732
733 unregister_shrinker(&ashmem_shrinker);
734
735 ret = misc_deregister(&ashmem_misc);
736 if (unlikely(ret))
737 printk(KERN_ERR "ashmem: failed to unregister misc device!\n");
738
739 kmem_cache_destroy(ashmem_range_cachep);
740 kmem_cache_destroy(ashmem_area_cachep);
741
742 printk(KERN_INFO "ashmem: unloaded\n");
743}
744
745module_init(ashmem_init);
746module_exit(ashmem_exit);
747
748MODULE_LICENSE("GPL");
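
The new driver is exercised purely through /dev/ashmem and the ioctls handled in ashmem_ioctl() above. The following userspace sketch is not part of the patch; it only illustrates the intended flow, assuming the ASHMEM_* constants and struct ashmem_pin come from the <linux/ashmem.h> header this file includes, with error handling kept minimal and a 4 KiB page size assumed.

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/ashmem.h>          /* ASHMEM_* ioctls, struct ashmem_pin */

#define REGION_SIZE (4 * 4096)     /* four pages, assuming 4 KiB pages */

int main(void)
{
        char name[ASHMEM_NAME_LEN] = "example-region";
        struct ashmem_pin pin = { .offset = 3 * 4096, .len = 4096 };
        char *p;

        int fd = open("/dev/ashmem", O_RDWR);
        if (fd < 0)
                return 1;

        /* Name and size must be set before the first mmap(). */
        ioctl(fd, ASHMEM_SET_NAME, name);
        ioctl(fd, ASHMEM_SET_SIZE, REGION_SIZE);

        p = mmap(NULL, REGION_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        memset(p, 0, REGION_SIZE);

        /* Unpin the last page: the shrinker may now purge it under pressure. */
        ioctl(fd, ASHMEM_UNPIN, &pin);

        /* Re-pin before touching it again; ASHMEM_WAS_PURGED means the
         * contents were discarded and must be regenerated. */
        if (ioctl(fd, ASHMEM_PIN, &pin) == ASHMEM_WAS_PURGED)
                memset(p + 3 * 4096, 0, 4096);

        munmap(p, REGION_SIZE);
        close(fd);
        return 0;
}

The pin/unpin pair mirrors ashmem_pin_unpin(): unpinned pages may be reclaimed by ashmem_shrink(), and a later ASHMEM_PIN returning ASHMEM_WAS_PURGED tells the caller the contents are gone and must be rebuilt.
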
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f032e6e1e09..253b071b7d9 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer;
45static int bdi_sync_supers(void *); 45static int bdi_sync_supers(void *);
46static void sync_supers_timer_fn(unsigned long); 46static void sync_supers_timer_fn(unsigned long);
47 47
48void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
49{
50 if (wb1 < wb2) {
51 spin_lock(&wb1->list_lock);
52 spin_lock_nested(&wb2->list_lock, 1);
53 } else {
54 spin_lock(&wb2->list_lock);
55 spin_lock_nested(&wb1->list_lock, 1);
56 }
57}
58
48#ifdef CONFIG_DEBUG_FS 59#ifdef CONFIG_DEBUG_FS
49#include <linux/debugfs.h> 60#include <linux/debugfs.h>
50#include <linux/seq_file.h> 61#include <linux/seq_file.h>
@@ -67,34 +78,42 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
67 struct inode *inode; 78 struct inode *inode;
68 79
69 nr_dirty = nr_io = nr_more_io = 0; 80 nr_dirty = nr_io = nr_more_io = 0;
70 spin_lock(&inode_wb_list_lock); 81 spin_lock(&wb->list_lock);
71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 82 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
72 nr_dirty++; 83 nr_dirty++;
73 list_for_each_entry(inode, &wb->b_io, i_wb_list) 84 list_for_each_entry(inode, &wb->b_io, i_wb_list)
74 nr_io++; 85 nr_io++;
75 list_for_each_entry(inode, &wb->b_more_io, i_wb_list) 86 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
76 nr_more_io++; 87 nr_more_io++;
77 spin_unlock(&inode_wb_list_lock); 88 spin_unlock(&wb->list_lock);
78 89
79 global_dirty_limits(&background_thresh, &dirty_thresh); 90 global_dirty_limits(&background_thresh, &dirty_thresh);
80 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 91 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
81 92
82#define K(x) ((x) << (PAGE_SHIFT - 10)) 93#define K(x) ((x) << (PAGE_SHIFT - 10))
83 seq_printf(m, 94 seq_printf(m,
84 "BdiWriteback: %8lu kB\n" 95 "BdiWriteback: %10lu kB\n"
85 "BdiReclaimable: %8lu kB\n" 96 "BdiReclaimable: %10lu kB\n"
86 "BdiDirtyThresh: %8lu kB\n" 97 "BdiDirtyThresh: %10lu kB\n"
87 "DirtyThresh: %8lu kB\n" 98 "DirtyThresh: %10lu kB\n"
88 "BackgroundThresh: %8lu kB\n" 99 "BackgroundThresh: %10lu kB\n"
89 "b_dirty: %8lu\n" 100 "BdiWritten: %10lu kB\n"
90 "b_io: %8lu\n" 101 "BdiWriteBandwidth: %10lu kBps\n"
91 "b_more_io: %8lu\n" 102 "b_dirty: %10lu\n"
92 "bdi_list: %8u\n" 103 "b_io: %10lu\n"
93 "state: %8lx\n", 104 "b_more_io: %10lu\n"
105 "bdi_list: %10u\n"
106 "state: %10lx\n",
94 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 107 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
95 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 108 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
96 K(bdi_thresh), K(dirty_thresh), 109 K(bdi_thresh),
97 K(background_thresh), nr_dirty, nr_io, nr_more_io, 110 K(dirty_thresh),
111 K(background_thresh),
112 (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
113 (unsigned long) K(bdi->write_bandwidth),
114 nr_dirty,
115 nr_io,
116 nr_more_io,
98 !list_empty(&bdi->bdi_list), bdi->state); 117 !list_empty(&bdi->bdi_list), bdi->state);
99#undef K 118#undef K
100 119
@@ -249,18 +268,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
249 return wb_has_dirty_io(&bdi->wb); 268 return wb_has_dirty_io(&bdi->wb);
250} 269}
251 270
252static void bdi_flush_io(struct backing_dev_info *bdi)
253{
254 struct writeback_control wbc = {
255 .sync_mode = WB_SYNC_NONE,
256 .older_than_this = NULL,
257 .range_cyclic = 1,
258 .nr_to_write = 1024,
259 };
260
261 writeback_inodes_wb(&bdi->wb, &wbc);
262}
263
264/* 271/*
265 * kupdated() used to do this. We cannot do it from the bdi_forker_thread() 272 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
266 * or we risk deadlocking on ->s_umount. The longer term solution would be 273 * or we risk deadlocking on ->s_umount. The longer term solution would be
@@ -352,6 +359,17 @@ static unsigned long bdi_longest_inactive(void)
352 return max(5UL * 60 * HZ, interval); 359 return max(5UL * 60 * HZ, interval);
353} 360}
354 361
362/*
363 * Clear pending bit and wakeup anybody waiting for flusher thread creation or
364 * shutdown
365 */
366static void bdi_clear_pending(struct backing_dev_info *bdi)
367{
368 clear_bit(BDI_pending, &bdi->state);
369 smp_mb__after_clear_bit();
370 wake_up_bit(&bdi->state, BDI_pending);
371}
372
355static int bdi_forker_thread(void *ptr) 373static int bdi_forker_thread(void *ptr)
356{ 374{
357 struct bdi_writeback *me = ptr; 375 struct bdi_writeback *me = ptr;
@@ -383,6 +401,13 @@ static int bdi_forker_thread(void *ptr)
383 } 401 }
384 402
385 spin_lock_bh(&bdi_lock); 403 spin_lock_bh(&bdi_lock);
404 /*
405 * In the following loop we are going to check whether we have
406 * some work to do without any synchronization with tasks
407 * waking us up to do work for them. So we have to set task
408 * state already here so that we don't miss wakeups coming
409 * after we verify some condition.
410 */
386 set_current_state(TASK_INTERRUPTIBLE); 411 set_current_state(TASK_INTERRUPTIBLE);
387 412
388 list_for_each_entry(bdi, &bdi_list, bdi_list) { 413 list_for_each_entry(bdi, &bdi_list, bdi_list) {
@@ -446,9 +471,10 @@ static int bdi_forker_thread(void *ptr)
446 if (IS_ERR(task)) { 471 if (IS_ERR(task)) {
447 /* 472 /*
448 * If thread creation fails, force writeout of 473 * If thread creation fails, force writeout of
449 * the bdi from the thread. 474 * the bdi from the thread. Hopefully 1024 is
475 * large enough for efficient IO.
450 */ 476 */
451 bdi_flush_io(bdi); 477 writeback_inodes_wb(&bdi->wb, 1024);
452 } else { 478 } else {
453 /* 479 /*
454 * The spinlock makes sure we do not lose 480 * The spinlock makes sure we do not lose
@@ -461,11 +487,13 @@ static int bdi_forker_thread(void *ptr)
461 spin_unlock_bh(&bdi->wb_lock); 487 spin_unlock_bh(&bdi->wb_lock);
462 wake_up_process(task); 488 wake_up_process(task);
463 } 489 }
490 bdi_clear_pending(bdi);
464 break; 491 break;
465 492
466 case KILL_THREAD: 493 case KILL_THREAD:
467 __set_current_state(TASK_RUNNING); 494 __set_current_state(TASK_RUNNING);
468 kthread_stop(task); 495 kthread_stop(task);
496 bdi_clear_pending(bdi);
469 break; 497 break;
470 498
471 case NO_ACTION: 499 case NO_ACTION:
@@ -481,16 +509,8 @@ static int bdi_forker_thread(void *ptr)
481 else 509 else
482 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); 510 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
483 try_to_freeze(); 511 try_to_freeze();
484 /* Back to the main loop */ 512 break;
485 continue;
486 } 513 }
487
488 /*
489 * Clear pending bit and wakeup anybody waiting to tear us down.
490 */
491 clear_bit(BDI_pending, &bdi->state);
492 smp_mb__after_clear_bit();
493 wake_up_bit(&bdi->state, BDI_pending);
494 } 514 }
495 515
496 return 0; 516 return 0;
@@ -505,7 +525,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
505 list_del_rcu(&bdi->bdi_list); 525 list_del_rcu(&bdi->bdi_list);
506 spin_unlock_bh(&bdi_lock); 526 spin_unlock_bh(&bdi_lock);
507 527
508 synchronize_rcu(); 528 synchronize_rcu_expedited();
509} 529}
510 530
511int bdi_register(struct backing_dev_info *bdi, struct device *parent, 531int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -606,6 +626,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
606void bdi_unregister(struct backing_dev_info *bdi) 626void bdi_unregister(struct backing_dev_info *bdi)
607{ 627{
608 if (bdi->dev) { 628 if (bdi->dev) {
629 bdi_set_min_ratio(bdi, 0);
609 trace_writeback_bdi_unregister(bdi); 630 trace_writeback_bdi_unregister(bdi);
610 bdi_prune_sb(bdi); 631 bdi_prune_sb(bdi);
611 del_timer_sync(&bdi->wb.wakeup_timer); 632 del_timer_sync(&bdi->wb.wakeup_timer);
@@ -628,9 +649,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
628 INIT_LIST_HEAD(&wb->b_dirty); 649 INIT_LIST_HEAD(&wb->b_dirty);
629 INIT_LIST_HEAD(&wb->b_io); 650 INIT_LIST_HEAD(&wb->b_io);
630 INIT_LIST_HEAD(&wb->b_more_io); 651 INIT_LIST_HEAD(&wb->b_more_io);
652 spin_lock_init(&wb->list_lock);
631 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); 653 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
632} 654}
633 655
656/*
657 * Initial write bandwidth: 100 MB/s
658 */
659#define INIT_BW (100 << (20 - PAGE_SHIFT))
660
634int bdi_init(struct backing_dev_info *bdi) 661int bdi_init(struct backing_dev_info *bdi)
635{ 662{
636 int i, err; 663 int i, err;
@@ -653,6 +680,13 @@ int bdi_init(struct backing_dev_info *bdi)
653 } 680 }
654 681
655 bdi->dirty_exceeded = 0; 682 bdi->dirty_exceeded = 0;
683
684 bdi->bw_time_stamp = jiffies;
685 bdi->written_stamp = 0;
686
687 bdi->write_bandwidth = INIT_BW;
688 bdi->avg_write_bandwidth = INIT_BW;
689
656 err = prop_local_init_percpu(&bdi->completions); 690 err = prop_local_init_percpu(&bdi->completions);
657 691
658 if (err) { 692 if (err) {
@@ -676,15 +710,24 @@ void bdi_destroy(struct backing_dev_info *bdi)
676 if (bdi_has_dirty_io(bdi)) { 710 if (bdi_has_dirty_io(bdi)) {
677 struct bdi_writeback *dst = &default_backing_dev_info.wb; 711 struct bdi_writeback *dst = &default_backing_dev_info.wb;
678 712
679 spin_lock(&inode_wb_list_lock); 713 bdi_lock_two(&bdi->wb, dst);
680 list_splice(&bdi->wb.b_dirty, &dst->b_dirty); 714 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
681 list_splice(&bdi->wb.b_io, &dst->b_io); 715 list_splice(&bdi->wb.b_io, &dst->b_io);
682 list_splice(&bdi->wb.b_more_io, &dst->b_more_io); 716 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
683 spin_unlock(&inode_wb_list_lock); 717 spin_unlock(&bdi->wb.list_lock);
718 spin_unlock(&dst->list_lock);
684 } 719 }
685 720
686 bdi_unregister(bdi); 721 bdi_unregister(bdi);
687 722
723 /*
724 * If bdi_unregister() had already been called earlier, the
725 * wakeup_timer could still be armed because bdi_prune_sb()
726 * can race with the bdi_wakeup_thread_delayed() calls from
727 * __mark_inode_dirty().
728 */
729 del_timer_sync(&bdi->wb.wakeup_timer);
730
688 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 731 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
689 percpu_counter_destroy(&bdi->bdi_stat[i]); 732 percpu_counter_destroy(&bdi->bdi_stat[i]);
690 733
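
The bdi_lock_two() helper added above takes the two list_locks in address order (spin_lock_nested() is purely a lockdep annotation for the second, same-class lock), so any two callers agree on acquisition order and cannot deadlock against each other. A userspace analogue of the same idea, sketched here with pthread mutexes purely for illustration, looks like this:

#include <pthread.h>
#include <stdio.h>

static void lock_two(pthread_mutex_t *m1, pthread_mutex_t *m2)
{
        /* Always take the lock at the lower address first, so callers that
         * pass the same pair in either order can never deadlock ABBA-style. */
        if (m1 < m2) {
                pthread_mutex_lock(m1);
                pthread_mutex_lock(m2);
        } else {
                pthread_mutex_lock(m2);
                pthread_mutex_lock(m1);
        }
}

static void unlock_two(pthread_mutex_t *m1, pthread_mutex_t *m2)
{
        pthread_mutex_unlock(m1);
        pthread_mutex_unlock(m2);
}

int main(void)
{
        pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

        lock_two(&a, &b);       /* same acquisition order ... */
        unlock_two(&a, &b);
        lock_two(&b, &a);       /* ... no matter how the caller orders them */
        unlock_two(&b, &a);
        printf("locked and unlocked both pairs without deadlock\n");
        return 0;
}
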
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 03bf3bb4519..fbb58e34688 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -500,7 +500,7 @@ void dmam_pool_destroy(struct dma_pool *pool)
 {
        struct device *dev = pool->dev;

-       dma_pool_destroy(pool);
        WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
+       dma_pool_destroy(pool);
 }
 EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/mm/failslab.c b/mm/failslab.c
index c5f88f240dd..0dd7b8fec71 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -5,10 +5,6 @@ static struct {
        struct fault_attr attr;
        u32 ignore_gfp_wait;
        int cache_filter;
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-       struct dentry *ignore_gfp_wait_file;
-       struct dentry *cache_filter_file;
-#endif
 } failslab = {
        .attr = FAULT_ATTR_INITIALIZER,
        .ignore_gfp_wait = 1,
@@ -38,32 +34,25 @@ __setup("failslab=", setup_failslab);
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
 static int __init failslab_debugfs_init(void)
 {
-       mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
        struct dentry *dir;
-       int err;
-
-       err = init_fault_attr_dentries(&failslab.attr, "failslab");
-       if (err)
-               return err;
-       dir = failslab.attr.dentries.dir;
+       mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;

-       failslab.ignore_gfp_wait_file =
-               debugfs_create_bool("ignore-gfp-wait", mode, dir,
-                               &failslab.ignore_gfp_wait);
+       dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
+       if (IS_ERR(dir))
+               return PTR_ERR(dir);

-       failslab.cache_filter_file =
-               debugfs_create_bool("cache-filter", mode, dir,
-                               &failslab.cache_filter);
+       if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
+                               &failslab.ignore_gfp_wait))
+               goto fail;
+       if (!debugfs_create_bool("cache-filter", mode, dir,
+                               &failslab.cache_filter))
+               goto fail;

-       if (!failslab.ignore_gfp_wait_file ||
-           !failslab.cache_filter_file) {
-               err = -ENOMEM;
-               debugfs_remove(failslab.cache_filter_file);
-               debugfs_remove(failslab.ignore_gfp_wait_file);
-               cleanup_fault_attr_dentries(&failslab.attr);
-       }
+       return 0;
+fail:
+       debugfs_remove_recursive(dir);

-       return err;
+       return -ENOMEM;
 }

 late_initcall(failslab_debugfs_init);
diff --git a/mm/filemap.c b/mm/filemap.c
index a8251a8d345..0eedbf85062 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,7 +33,6 @@
33#include <linux/cpuset.h> 33#include <linux/cpuset.h>
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 35#include <linux/memcontrol.h>
36#include <linux/mm_inline.h> /* for page_is_file_cache() */
37#include <linux/cleancache.h> 36#include <linux/cleancache.h>
38#include "internal.h" 37#include "internal.h"
39 38
@@ -78,10 +77,7 @@
78 * ->i_mutex (generic_file_buffered_write) 77 * ->i_mutex (generic_file_buffered_write)
79 * ->mmap_sem (fault_in_pages_readable->do_page_fault) 78 * ->mmap_sem (fault_in_pages_readable->do_page_fault)
80 * 79 *
81 * ->i_mutex 80 * bdi->wb.list_lock
82 * ->i_alloc_sem (various)
83 *
84 * inode_wb_list_lock
85 * sb_lock (fs/fs-writeback.c) 81 * sb_lock (fs/fs-writeback.c)
86 * ->mapping->tree_lock (__sync_single_inode) 82 * ->mapping->tree_lock (__sync_single_inode)
87 * 83 *
@@ -99,9 +95,9 @@
99 * ->zone.lru_lock (check_pte_range->isolate_lru_page) 95 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
100 * ->private_lock (page_remove_rmap->set_page_dirty) 96 * ->private_lock (page_remove_rmap->set_page_dirty)
101 * ->tree_lock (page_remove_rmap->set_page_dirty) 97 * ->tree_lock (page_remove_rmap->set_page_dirty)
102 * inode_wb_list_lock (page_remove_rmap->set_page_dirty) 98 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
103 * ->inode->i_lock (page_remove_rmap->set_page_dirty) 99 * ->inode->i_lock (page_remove_rmap->set_page_dirty)
104 * inode_wb_list_lock (zap_pte_range->set_page_dirty) 100 * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
105 * ->inode->i_lock (zap_pte_range->set_page_dirty) 101 * ->inode->i_lock (zap_pte_range->set_page_dirty)
106 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 102 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
107 * 103 *
@@ -131,6 +127,7 @@ void __delete_from_page_cache(struct page *page)
131 127
132 radix_tree_delete(&mapping->page_tree, page->index); 128 radix_tree_delete(&mapping->page_tree, page->index);
133 page->mapping = NULL; 129 page->mapping = NULL;
130 /* Leave page->index set: truncation lookup relies upon it */
134 mapping->nrpages--; 131 mapping->nrpages--;
135 __dec_zone_page_state(page, NR_FILE_PAGES); 132 __dec_zone_page_state(page, NR_FILE_PAGES);
136 if (PageSwapBacked(page)) 133 if (PageSwapBacked(page))
@@ -396,24 +393,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range);
396int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) 393int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
397{ 394{
398 int error; 395 int error;
399 struct mem_cgroup *memcg = NULL;
400 396
401 VM_BUG_ON(!PageLocked(old)); 397 VM_BUG_ON(!PageLocked(old));
402 VM_BUG_ON(!PageLocked(new)); 398 VM_BUG_ON(!PageLocked(new));
403 VM_BUG_ON(new->mapping); 399 VM_BUG_ON(new->mapping);
404 400
405 /*
406 * This is not page migration, but prepare_migration and
407 * end_migration does enough work for charge replacement.
408 *
409 * In the longer term we probably want a specialized function
410 * for moving the charge from old to new in a more efficient
411 * manner.
412 */
413 error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
414 if (error)
415 return error;
416
417 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 401 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
418 if (!error) { 402 if (!error) {
419 struct address_space *mapping = old->mapping; 403 struct address_space *mapping = old->mapping;
@@ -435,13 +419,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
435 if (PageSwapBacked(new)) 419 if (PageSwapBacked(new))
436 __inc_zone_page_state(new, NR_SHMEM); 420 __inc_zone_page_state(new, NR_SHMEM);
437 spin_unlock_irq(&mapping->tree_lock); 421 spin_unlock_irq(&mapping->tree_lock);
422 /* mem_cgroup codes must not be called under tree_lock */
423 mem_cgroup_replace_page_cache(old, new);
438 radix_tree_preload_end(); 424 radix_tree_preload_end();
439 if (freepage) 425 if (freepage)
440 freepage(old); 426 freepage(old);
441 page_cache_release(old); 427 page_cache_release(old);
442 mem_cgroup_end_migration(memcg, old, new, true);
443 } else {
444 mem_cgroup_end_migration(memcg, old, new, false);
445 } 428 }
446 429
447 return error; 430 return error;
@@ -464,6 +447,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
464 int error; 447 int error;
465 448
466 VM_BUG_ON(!PageLocked(page)); 449 VM_BUG_ON(!PageLocked(page));
450 VM_BUG_ON(PageSwapBacked(page));
467 451
468 error = mem_cgroup_cache_charge(page, current->mm, 452 error = mem_cgroup_cache_charge(page, current->mm,
469 gfp_mask & GFP_RECLAIM_MASK); 453 gfp_mask & GFP_RECLAIM_MASK);
@@ -481,11 +465,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
481 if (likely(!error)) { 465 if (likely(!error)) {
482 mapping->nrpages++; 466 mapping->nrpages++;
483 __inc_zone_page_state(page, NR_FILE_PAGES); 467 __inc_zone_page_state(page, NR_FILE_PAGES);
484 if (PageSwapBacked(page))
485 __inc_zone_page_state(page, NR_SHMEM);
486 spin_unlock_irq(&mapping->tree_lock); 468 spin_unlock_irq(&mapping->tree_lock);
487 } else { 469 } else {
488 page->mapping = NULL; 470 page->mapping = NULL;
471 /* Leave page->index set: truncation relies upon it */
489 spin_unlock_irq(&mapping->tree_lock); 472 spin_unlock_irq(&mapping->tree_lock);
490 mem_cgroup_uncharge_cache_page(page); 473 mem_cgroup_uncharge_cache_page(page);
491 page_cache_release(page); 474 page_cache_release(page);
@@ -503,22 +486,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
503{ 486{
504 int ret; 487 int ret;
505 488
506 /*
507 * Splice_read and readahead add shmem/tmpfs pages into the page cache
508 * before shmem_readpage has a chance to mark them as SwapBacked: they
509 * need to go on the anon lru below, and mem_cgroup_cache_charge
510 * (called in add_to_page_cache) needs to know where they're going too.
511 */
512 if (mapping_cap_swap_backed(mapping))
513 SetPageSwapBacked(page);
514
515 ret = add_to_page_cache(page, mapping, offset, gfp_mask); 489 ret = add_to_page_cache(page, mapping, offset, gfp_mask);
516 if (ret == 0) { 490 if (ret == 0)
517 if (page_is_file_cache(page)) 491 lru_cache_add_file(page);
518 lru_cache_add_file(page);
519 else
520 lru_cache_add_anon(page);
521 }
522 return ret; 492 return ret;
523} 493}
524EXPORT_SYMBOL_GPL(add_to_page_cache_lru); 494EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
@@ -715,9 +685,16 @@ repeat:
715 page = radix_tree_deref_slot(pagep); 685 page = radix_tree_deref_slot(pagep);
716 if (unlikely(!page)) 686 if (unlikely(!page))
717 goto out; 687 goto out;
718 if (radix_tree_deref_retry(page)) 688 if (radix_tree_exception(page)) {
719 goto repeat; 689 if (radix_tree_deref_retry(page))
720 690 goto repeat;
691 /*
692 * Otherwise, shmem/tmpfs must be storing a swap entry
693 * here as an exceptional entry: so return it without
694 * attempting to raise page count.
695 */
696 goto out;
697 }
721 if (!page_cache_get_speculative(page)) 698 if (!page_cache_get_speculative(page))
722 goto repeat; 699 goto repeat;
723 700
@@ -754,7 +731,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
754 731
755repeat: 732repeat:
756 page = find_get_page(mapping, offset); 733 page = find_get_page(mapping, offset);
757 if (page) { 734 if (page && !radix_tree_exception(page)) {
758 lock_page(page); 735 lock_page(page);
759 /* Has the page been truncated? */ 736 /* Has the page been truncated? */
760 if (unlikely(page->mapping != mapping)) { 737 if (unlikely(page->mapping != mapping)) {
@@ -836,13 +813,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
836{ 813{
837 unsigned int i; 814 unsigned int i;
838 unsigned int ret; 815 unsigned int ret;
839 unsigned int nr_found; 816 unsigned int nr_found, nr_skip;
840 817
841 rcu_read_lock(); 818 rcu_read_lock();
842restart: 819restart:
843 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 820 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
844 (void ***)pages, start, nr_pages); 821 (void ***)pages, NULL, start, nr_pages);
845 ret = 0; 822 ret = 0;
823 nr_skip = 0;
846 for (i = 0; i < nr_found; i++) { 824 for (i = 0; i < nr_found; i++) {
847 struct page *page; 825 struct page *page;
848repeat: 826repeat:
@@ -850,13 +828,23 @@ repeat:
850 if (unlikely(!page)) 828 if (unlikely(!page))
851 continue; 829 continue;
852 830
853 /* 831 if (radix_tree_exception(page)) {
854 * This can only trigger when the entry at index 0 moves out 832 if (radix_tree_deref_retry(page)) {
855 * of or back to the root: none yet gotten, safe to restart. 833 /*
856 */ 834 * Transient condition which can only trigger
857 if (radix_tree_deref_retry(page)) { 835 * when entry at index 0 moves out of or back
858 WARN_ON(start | i); 836 * to root: none yet gotten, safe to restart.
859 goto restart; 837 */
838 WARN_ON(start | i);
839 goto restart;
840 }
841 /*
842 * Otherwise, shmem/tmpfs must be storing a swap entry
843 * here as an exceptional entry: so skip over it -
844 * we only reach this from invalidate_mapping_pages().
845 */
846 nr_skip++;
847 continue;
860 } 848 }
861 849
862 if (!page_cache_get_speculative(page)) 850 if (!page_cache_get_speculative(page))
@@ -876,7 +864,7 @@ repeat:
876 * If all entries were removed before we could secure them, 864 * If all entries were removed before we could secure them,
877 * try again, because callers stop trying once 0 is returned. 865 * try again, because callers stop trying once 0 is returned.
878 */ 866 */
879 if (unlikely(!ret && nr_found)) 867 if (unlikely(!ret && nr_found > nr_skip))
880 goto restart; 868 goto restart;
881 rcu_read_unlock(); 869 rcu_read_unlock();
882 return ret; 870 return ret;
@@ -904,7 +892,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
904 rcu_read_lock(); 892 rcu_read_lock();
905restart: 893restart:
906 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 894 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
907 (void ***)pages, index, nr_pages); 895 (void ***)pages, NULL, index, nr_pages);
908 ret = 0; 896 ret = 0;
909 for (i = 0; i < nr_found; i++) { 897 for (i = 0; i < nr_found; i++) {
910 struct page *page; 898 struct page *page;
@@ -913,12 +901,22 @@ repeat:
913 if (unlikely(!page)) 901 if (unlikely(!page))
914 continue; 902 continue;
915 903
916 /* 904 if (radix_tree_exception(page)) {
917 * This can only trigger when the entry at index 0 moves out 905 if (radix_tree_deref_retry(page)) {
918 * of or back to the root: none yet gotten, safe to restart. 906 /*
919 */ 907 * Transient condition which can only trigger
920 if (radix_tree_deref_retry(page)) 908 * when entry at index 0 moves out of or back
921 goto restart; 909 * to root: none yet gotten, safe to restart.
910 */
911 goto restart;
912 }
913 /*
914 * Otherwise, shmem/tmpfs must be storing a swap entry
915 * here as an exceptional entry: so stop looking for
916 * contiguous pages.
917 */
918 break;
919 }
922 920
923 if (!page_cache_get_speculative(page)) 921 if (!page_cache_get_speculative(page))
924 goto repeat; 922 goto repeat;
@@ -978,12 +976,21 @@ repeat:
978 if (unlikely(!page)) 976 if (unlikely(!page))
979 continue; 977 continue;
980 978
981 /* 979 if (radix_tree_exception(page)) {
982 * This can only trigger when the entry at index 0 moves out 980 if (radix_tree_deref_retry(page)) {
983 * of or back to the root: none yet gotten, safe to restart. 981 /*
984 */ 982 * Transient condition which can only trigger
985 if (radix_tree_deref_retry(page)) 983 * when entry at index 0 moves out of or back
986 goto restart; 984 * to root: none yet gotten, safe to restart.
985 */
986 goto restart;
987 }
988 /*
989 * This function is never used on a shmem/tmpfs
990 * mapping, so a swap entry won't be found here.
991 */
992 BUG();
993 }
987 994
988 if (!page_cache_get_speculative(page)) 995 if (!page_cache_get_speculative(page))
989 goto repeat; 996 goto repeat;
@@ -1795,7 +1802,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
1795 1802
1796static struct page *__read_cache_page(struct address_space *mapping, 1803static struct page *__read_cache_page(struct address_space *mapping,
1797 pgoff_t index, 1804 pgoff_t index,
1798 int (*filler)(void *,struct page*), 1805 int (*filler)(void *, struct page *),
1799 void *data, 1806 void *data,
1800 gfp_t gfp) 1807 gfp_t gfp)
1801{ 1808{
@@ -1807,7 +1814,7 @@ repeat:
1807 page = __page_cache_alloc(gfp | __GFP_COLD); 1814 page = __page_cache_alloc(gfp | __GFP_COLD);
1808 if (!page) 1815 if (!page)
1809 return ERR_PTR(-ENOMEM); 1816 return ERR_PTR(-ENOMEM);
1810 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); 1817 err = add_to_page_cache_lru(page, mapping, index, gfp);
1811 if (unlikely(err)) { 1818 if (unlikely(err)) {
1812 page_cache_release(page); 1819 page_cache_release(page);
1813 if (err == -EEXIST) 1820 if (err == -EEXIST)
@@ -1826,7 +1833,7 @@ repeat:
1826 1833
1827static struct page *do_read_cache_page(struct address_space *mapping, 1834static struct page *do_read_cache_page(struct address_space *mapping,
1828 pgoff_t index, 1835 pgoff_t index,
1829 int (*filler)(void *,struct page*), 1836 int (*filler)(void *, struct page *),
1830 void *data, 1837 void *data,
1831 gfp_t gfp) 1838 gfp_t gfp)
1832 1839
@@ -1866,7 +1873,7 @@ out:
1866 * @mapping: the page's address_space 1873 * @mapping: the page's address_space
1867 * @index: the page index 1874 * @index: the page index
1868 * @filler: function to perform the read 1875 * @filler: function to perform the read
1869 * @data: destination for read data 1876 * @data: first arg to filler(data, page) function, often left as NULL
1870 * 1877 *
1871 * Same as read_cache_page, but don't wait for page to become unlocked 1878 * Same as read_cache_page, but don't wait for page to become unlocked
1872 * after submitting it to the filler. 1879 * after submitting it to the filler.
@@ -1878,7 +1885,7 @@ out:
1878 */ 1885 */
1879struct page *read_cache_page_async(struct address_space *mapping, 1886struct page *read_cache_page_async(struct address_space *mapping,
1880 pgoff_t index, 1887 pgoff_t index,
1881 int (*filler)(void *,struct page*), 1888 int (*filler)(void *, struct page *),
1882 void *data) 1889 void *data)
1883{ 1890{
1884 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); 1891 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
@@ -1904,10 +1911,7 @@ static struct page *wait_on_page_read(struct page *page)
1904 * @gfp: the page allocator flags to use if allocating 1911 * @gfp: the page allocator flags to use if allocating
1905 * 1912 *
1906 * This is the same as "read_mapping_page(mapping, index, NULL)", but with 1913 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
1907 * any new page allocations done using the specified allocation flags. Note 1914 * any new page allocations done using the specified allocation flags.
1908 * that the Radix tree operations will still use GFP_KERNEL, so you can't
1909 * expect to do this atomically or anything like that - but you can pass in
1910 * other page requirements.
1911 * 1915 *
1912 * If the page does not get brought uptodate, return -EIO. 1916 * If the page does not get brought uptodate, return -EIO.
1913 */ 1917 */
@@ -1926,7 +1930,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
1926 * @mapping: the page's address_space 1930 * @mapping: the page's address_space
1927 * @index: the page index 1931 * @index: the page index
1928 * @filler: function to perform the read 1932 * @filler: function to perform the read
1929 * @data: destination for read data 1933 * @data: first arg to filler(data, page) function, often left as NULL
1930 * 1934 *
1931 * Read into the page cache. If a page already exists, and PageUptodate() is 1935 * Read into the page cache. If a page already exists, and PageUptodate() is
1932 * not set, try to fill the page then wait for it to become unlocked. 1936 * not set, try to fill the page then wait for it to become unlocked.
@@ -1935,7 +1939,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
1935 */ 1939 */
1936struct page *read_cache_page(struct address_space *mapping, 1940struct page *read_cache_page(struct address_space *mapping,
1937 pgoff_t index, 1941 pgoff_t index,
1938 int (*filler)(void *,struct page*), 1942 int (*filler)(void *, struct page *),
1939 void *data) 1943 void *data)
1940{ 1944{
1941 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); 1945 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
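
All of the lookup changes above follow one pattern: a page-cache slot may now hold a real page pointer, a transient retry marker, or a shmem/tmpfs swap entry stored as an exceptional entry, and callers must test radix_tree_exception() before taking a page reference. The sketch below only illustrates the underlying low-bit tagging idea in userspace; the bit layout is made up for the example and is not the kernel's exact encoding.

#include <stdint.h>
#include <stdio.h>

#define EXCEPTIONAL_BIT 0x2UL   /* illustrative tag bit, not the kernel's */

/* Pack a small value into a slot that normally holds an aligned pointer. */
static void *make_exceptional(unsigned long value)
{
        return (void *)(uintptr_t)((value << 2) | EXCEPTIONAL_BIT);
}

/* Real pointers are at least 4-byte aligned, so their low bits are clear. */
static int is_exceptional(const void *entry)
{
        return ((uintptr_t)entry & EXCEPTIONAL_BIT) != 0;
}

static unsigned long exceptional_value(const void *entry)
{
        return (unsigned long)((uintptr_t)entry >> 2);
}

int main(void)
{
        int page = 42;                  /* stands in for a struct page */
        void *slot[2] = { &page, make_exceptional(12345) };

        for (int i = 0; i < 2; i++) {
                if (is_exceptional(slot[i]))
                        printf("slot %d: swap-style entry %lu\n",
                               i, exceptional_value(slot[i]));
                else
                        printf("slot %d: real pointer %p\n", i, slot[i]);
        }
        return 0;
}
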
diff --git a/mm/highmem.c b/mm/highmem.c
index 693394daa2e..5ef672c07f7 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -326,7 +326,7 @@ static struct page_address_slot {
        spinlock_t lock;                /* Protect this bucket's list */
 } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];

-static struct page_address_slot *page_slot(struct page *page)
+static struct page_address_slot *page_slot(const struct page *page)
 {
        return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
 }
@@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page)
  *
  * Returns the page's virtual address.
  */
-void *page_address(struct page *page)
+void *page_address(const struct page *page)
 {
        unsigned long flags;
        void *ret;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 81532f297fd..d819d938288 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -989,7 +989,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
989 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 989 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
990 VM_BUG_ON(!PageCompound(page)); 990 VM_BUG_ON(!PageCompound(page));
991 if (flags & FOLL_GET) 991 if (flags & FOLL_GET)
992 get_page(page); 992 get_page_foll(page);
993 993
994out: 994out:
995 return page; 995 return page;
@@ -1156,6 +1156,7 @@ static void __split_huge_page_refcount(struct page *page)
1156 unsigned long head_index = page->index; 1156 unsigned long head_index = page->index;
1157 struct zone *zone = page_zone(page); 1157 struct zone *zone = page_zone(page);
1158 int zonestat; 1158 int zonestat;
1159 int tail_count = 0;
1159 1160
1160 /* prevent PageLRU to go away from under us, and freeze lru stats */ 1161 /* prevent PageLRU to go away from under us, and freeze lru stats */
1161 spin_lock_irq(&zone->lru_lock); 1162 spin_lock_irq(&zone->lru_lock);
@@ -1164,11 +1165,27 @@ static void __split_huge_page_refcount(struct page *page)
1164 for (i = 1; i < HPAGE_PMD_NR; i++) { 1165 for (i = 1; i < HPAGE_PMD_NR; i++) {
1165 struct page *page_tail = page + i; 1166 struct page *page_tail = page + i;
1166 1167
1167 /* tail_page->_count cannot change */ 1168 /* tail_page->_mapcount cannot change */
1168 atomic_sub(atomic_read(&page_tail->_count), &page->_count); 1169 BUG_ON(page_mapcount(page_tail) < 0);
1169 BUG_ON(page_count(page) <= 0); 1170 tail_count += page_mapcount(page_tail);
1170 atomic_add(page_mapcount(page) + 1, &page_tail->_count); 1171 /* check for overflow */
1171 BUG_ON(atomic_read(&page_tail->_count) <= 0); 1172 BUG_ON(tail_count < 0);
1173 BUG_ON(atomic_read(&page_tail->_count) != 0);
1174 /*
1175 * tail_page->_count is zero and not changing from
1176 * under us. But get_page_unless_zero() may be running
1177 * from under us on the tail_page. If we used
1178 * atomic_set() below instead of atomic_add(), we
1179 * would then run atomic_set() concurrently with
1180 * get_page_unless_zero(), and atomic_set() is
1181 * implemented in C not using locked ops. spin_unlock
1182 * on x86 sometime uses locked ops because of PPro
1183 * errata 66, 92, so unless somebody can guarantee
1184 * atomic_set() here would be safe on all archs (and
1185 * not only on x86), it's safer to use atomic_add().
1186 */
1187 atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
1188 &page_tail->_count);
1172 1189
1173 /* after clearing PageTail the gup refcount can be released */ 1190 /* after clearing PageTail the gup refcount can be released */
1174 smp_mb(); 1191 smp_mb();
@@ -1186,10 +1203,7 @@ static void __split_huge_page_refcount(struct page *page)
1186 (1L << PG_uptodate))); 1203 (1L << PG_uptodate)));
1187 page_tail->flags |= (1L << PG_dirty); 1204 page_tail->flags |= (1L << PG_dirty);
1188 1205
1189 /* 1206 /* clear PageTail before overwriting first_page */
1190 * 1) clear PageTail before overwriting first_page
1191 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
1192 */
1193 smp_wmb(); 1207 smp_wmb();
1194 1208
1195 /* 1209 /*
@@ -1206,7 +1220,6 @@ static void __split_huge_page_refcount(struct page *page)
1206 * status is achieved setting a reserved bit in the 1220 * status is achieved setting a reserved bit in the
1207 * pmd, not by clearing the present bit. 1221 * pmd, not by clearing the present bit.
1208 */ 1222 */
1209 BUG_ON(page_mapcount(page_tail));
1210 page_tail->_mapcount = page->_mapcount; 1223 page_tail->_mapcount = page->_mapcount;
1211 1224
1212 BUG_ON(page_tail->mapping); 1225 BUG_ON(page_tail->mapping);
@@ -1223,6 +1236,8 @@ static void __split_huge_page_refcount(struct page *page)
1223 1236
1224 lru_add_page_tail(zone, page, page_tail); 1237 lru_add_page_tail(zone, page, page_tail);
1225 } 1238 }
1239 atomic_sub(tail_count, &page->_count);
1240 BUG_ON(atomic_read(&page->_count) <= 0);
1226 1241
1227 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1242 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1228 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); 1243 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
@@ -1596,14 +1611,13 @@ void __khugepaged_exit(struct mm_struct *mm)
1596 list_del(&mm_slot->mm_node); 1611 list_del(&mm_slot->mm_node);
1597 free = 1; 1612 free = 1;
1598 } 1613 }
1614 spin_unlock(&khugepaged_mm_lock);
1599 1615
1600 if (free) { 1616 if (free) {
1601 spin_unlock(&khugepaged_mm_lock);
1602 clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 1617 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1603 free_mm_slot(mm_slot); 1618 free_mm_slot(mm_slot);
1604 mmdrop(mm); 1619 mmdrop(mm);
1605 } else if (mm_slot) { 1620 } else if (mm_slot) {
1606 spin_unlock(&khugepaged_mm_lock);
1607 /* 1621 /*
1608 * This is required to serialize against 1622 * This is required to serialize against
1609 * khugepaged_test_exit() (which is guaranteed to run 1623 * khugepaged_test_exit() (which is guaranteed to run
@@ -1614,8 +1628,7 @@ void __khugepaged_exit(struct mm_struct *mm)
1614 */ 1628 */
1615 down_write(&mm->mmap_sem); 1629 down_write(&mm->mmap_sem);
1616 up_write(&mm->mmap_sem); 1630 up_write(&mm->mmap_sem);
1617 } else 1631 }
1618 spin_unlock(&khugepaged_mm_lock);
1619} 1632}
1620 1633
1621static void release_pte_page(struct page *page) 1634static void release_pte_page(struct page *page)
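For orientation before the hugetlb hunks: the refcount rework above follows one invariant. While a THP is mapped, every tail page keeps _count == 0 and any gup pin taken on a tail is recorded in that tail's _mapcount (plus a matching reference on the head); only at split time does each tail receive a real refcount, and the head releases, in one step, the pins it was holding on the tails' behalf. A compressed sketch of that accounting ('head' below stands for the compound page the hunk calls 'page'; kernel context assumed, simplified from the loop above):

	int i, tail_pins = 0;

	for (i = 1; i < HPAGE_PMD_NR; i++) {
		struct page *tail = head + i;

		BUG_ON(atomic_read(&tail->_count) != 0);  /* invariant while mapped */
		tail_pins += page_mapcount(tail);         /* gup pins parked in _mapcount */
		/* head's mapcount, the tail's own pins, plus one base reference */
		atomic_add(page_mapcount(head) + page_mapcount(tail) + 1,
			   &tail->_count);
	}
	atomic_sub(tail_pins, &head->_count);             /* one adjustment, outside the loop */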
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bfcf153bc82..2316840b337 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,7 +24,7 @@
24 24
25#include <asm/page.h> 25#include <asm/page.h>
26#include <asm/pgtable.h> 26#include <asm/pgtable.h>
27#include <asm/io.h> 27#include <linux/io.h>
28 28
29#include <linux/hugetlb.h> 29#include <linux/hugetlb.h>
30#include <linux/node.h> 30#include <linux/node.h>
@@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(hugetlb_lock);
62 * must either hold the mmap_sem for write, or the mmap_sem for read and 62 * must either hold the mmap_sem for write, or the mmap_sem for read and
63 * the hugetlb_instantiation mutex: 63 * the hugetlb_instantiation mutex:
64 * 64 *
65 * down_write(&mm->mmap_sem); 65 * down_write(&mm->mmap_sem);
66 * or 66 * or
67 * down_read(&mm->mmap_sem); 67 * down_read(&mm->mmap_sem);
68 * mutex_lock(&hugetlb_instantiation_mutex); 68 * mutex_lock(&hugetlb_instantiation_mutex);
69 */ 69 */
70struct file_region { 70struct file_region {
71 struct list_head link; 71 struct list_head link;
@@ -503,9 +503,10 @@ static void update_and_free_page(struct hstate *h, struct page *page)
503 h->nr_huge_pages--; 503 h->nr_huge_pages--;
504 h->nr_huge_pages_node[page_to_nid(page)]--; 504 h->nr_huge_pages_node[page_to_nid(page)]--;
505 for (i = 0; i < pages_per_huge_page(h); i++) { 505 for (i = 0; i < pages_per_huge_page(h); i++) {
506 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 506 page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
507 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 507 1 << PG_referenced | 1 << PG_dirty |
508 1 << PG_private | 1<< PG_writeback); 508 1 << PG_active | 1 << PG_reserved |
509 1 << PG_private | 1 << PG_writeback);
509 } 510 }
510 set_compound_page_dtor(page, NULL); 511 set_compound_page_dtor(page, NULL);
511 set_page_refcounted(page); 512 set_page_refcounted(page);
@@ -575,6 +576,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
575 __SetPageHead(page); 576 __SetPageHead(page);
576 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 577 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
577 __SetPageTail(p); 578 __SetPageTail(p);
579 set_page_count(p, 0);
578 p->first_page = page; 580 p->first_page = page;
579 } 581 }
580} 582}
@@ -591,7 +593,6 @@ int PageHuge(struct page *page)
591 593
592 return dtor == free_huge_page; 594 return dtor == free_huge_page;
593} 595}
594
595EXPORT_SYMBOL_GPL(PageHuge); 596EXPORT_SYMBOL_GPL(PageHuge);
596 597
597static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 598static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
@@ -900,7 +901,6 @@ retry:
900 h->resv_huge_pages += delta; 901 h->resv_huge_pages += delta;
901 ret = 0; 902 ret = 0;
902 903
903 spin_unlock(&hugetlb_lock);
904 /* Free the needed pages to the hugetlb pool */ 904 /* Free the needed pages to the hugetlb pool */
905 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 905 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
906 if ((--needed) < 0) 906 if ((--needed) < 0)
@@ -914,6 +914,7 @@ retry:
914 VM_BUG_ON(page_count(page)); 914 VM_BUG_ON(page_count(page));
915 enqueue_huge_page(h, page); 915 enqueue_huge_page(h, page);
916 } 916 }
917 spin_unlock(&hugetlb_lock);
917 918
918 /* Free unnecessary surplus pages to the buddy allocator */ 919 /* Free unnecessary surplus pages to the buddy allocator */
919free: 920free:
@@ -1105,8 +1106,16 @@ static void __init gather_bootmem_prealloc(void)
1105 struct huge_bootmem_page *m; 1106 struct huge_bootmem_page *m;
1106 1107
1107 list_for_each_entry(m, &huge_boot_pages, list) { 1108 list_for_each_entry(m, &huge_boot_pages, list) {
1108 struct page *page = virt_to_page(m);
1109 struct hstate *h = m->hstate; 1109 struct hstate *h = m->hstate;
1110 struct page *page;
1111
1112#ifdef CONFIG_HIGHMEM
1113 page = pfn_to_page(m->phys >> PAGE_SHIFT);
1114 free_bootmem_late((unsigned long)m,
1115 sizeof(struct huge_bootmem_page));
1116#else
1117 page = virt_to_page(m);
1118#endif
1110 __ClearPageReserved(page); 1119 __ClearPageReserved(page);
1111 WARN_ON(page_count(page) != 1); 1120 WARN_ON(page_count(page) != 1);
1112 prep_compound_huge_page(page, h->order); 1121 prep_compound_huge_page(page, h->order);
@@ -2124,9 +2133,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
2124 pte_t entry; 2133 pte_t entry;
2125 2134
2126 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); 2135 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
2127 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { 2136 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
2128 update_mmu_cache(vma, address, ptep); 2137 update_mmu_cache(vma, address, ptep);
2129 }
2130} 2138}
2131 2139
2132 2140
@@ -2181,9 +2189,9 @@ static int is_hugetlb_entry_migration(pte_t pte)
2181 if (huge_pte_none(pte) || pte_present(pte)) 2189 if (huge_pte_none(pte) || pte_present(pte))
2182 return 0; 2190 return 0;
2183 swp = pte_to_swp_entry(pte); 2191 swp = pte_to_swp_entry(pte);
2184 if (non_swap_entry(swp) && is_migration_entry(swp)) { 2192 if (non_swap_entry(swp) && is_migration_entry(swp))
2185 return 1; 2193 return 1;
2186 } else 2194 else
2187 return 0; 2195 return 0;
2188} 2196}
2189 2197
@@ -2194,9 +2202,9 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2194 if (huge_pte_none(pte) || pte_present(pte)) 2202 if (huge_pte_none(pte) || pte_present(pte))
2195 return 0; 2203 return 0;
2196 swp = pte_to_swp_entry(pte); 2204 swp = pte_to_swp_entry(pte);
2197 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { 2205 if (non_swap_entry(swp) && is_hwpoison_entry(swp))
2198 return 1; 2206 return 1;
2199 } else 2207 else
2200 return 0; 2208 return 0;
2201} 2209}
2202 2210
@@ -2415,6 +2423,8 @@ retry_avoidcopy:
2415 * anon_vma prepared. 2423 * anon_vma prepared.
2416 */ 2424 */
2417 if (unlikely(anon_vma_prepare(vma))) { 2425 if (unlikely(anon_vma_prepare(vma))) {
2426 page_cache_release(new_page);
2427 page_cache_release(old_page);
2418 /* Caller expects lock to be held */ 2428 /* Caller expects lock to be held */
2419 spin_lock(&mm->page_table_lock); 2429 spin_lock(&mm->page_table_lock);
2420 return VM_FAULT_OOM; 2430 return VM_FAULT_OOM;
@@ -2559,7 +2569,7 @@ retry:
2559 * So we need to block hugepage fault by PG_hwpoison bit check. 2569 * So we need to block hugepage fault by PG_hwpoison bit check.
2560 */ 2570 */
2561 if (unlikely(PageHWPoison(page))) { 2571 if (unlikely(PageHWPoison(page))) {
2562 ret = VM_FAULT_HWPOISON | 2572 ret = VM_FAULT_HWPOISON |
2563 VM_FAULT_SET_HINDEX(h - hstates); 2573 VM_FAULT_SET_HINDEX(h - hstates);
2564 goto backout_unlocked; 2574 goto backout_unlocked;
2565 } 2575 }
@@ -2627,7 +2637,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2627 migration_entry_wait(mm, (pmd_t *)ptep, address); 2637 migration_entry_wait(mm, (pmd_t *)ptep, address);
2628 return 0; 2638 return 0;
2629 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2639 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2630 return VM_FAULT_HWPOISON_LARGE | 2640 return VM_FAULT_HWPOISON_LARGE |
2631 VM_FAULT_SET_HINDEX(h - hstates); 2641 VM_FAULT_SET_HINDEX(h - hstates);
2632 } 2642 }
2633 2643
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 4019979b263..a56a851908d 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -5,7 +5,7 @@
5#include <linux/list.h> 5#include <linux/list.h>
6#include <linux/cpumask.h> 6#include <linux/cpumask.h>
7 7
8#include <asm/atomic.h> 8#include <linux/atomic.h>
9#include <asm/pgtable.h> 9#include <asm/pgtable.h>
10#include <asm/mmu.h> 10#include <asm/mmu.h>
11 11
diff --git a/mm/internal.h b/mm/internal.h
index d071d380fb4..2189af49178 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page)
37 atomic_dec(&page->_count); 37 atomic_dec(&page->_count);
38} 38}
39 39
40static inline void __get_page_tail_foll(struct page *page,
41 bool get_page_head)
42{
43 /*
44 * If we're getting a tail page, the elevated page->_count is
45 * required only in the head page and we will elevate the head
46 * page->_count and tail page->_mapcount.
47 *
48 * We elevate page_tail->_mapcount for tail pages to force
49 * page_tail->_count to be zero at all times to avoid getting
50 * false positives from get_page_unless_zero() with
51 * speculative page access (like in
52 * page_cache_get_speculative()) on tail pages.
53 */
54 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
55 VM_BUG_ON(atomic_read(&page->_count) != 0);
56 VM_BUG_ON(page_mapcount(page) < 0);
57 if (get_page_head)
58 atomic_inc(&page->first_page->_count);
59 atomic_inc(&page->_mapcount);
60}
61
62/*
63 * This is meant to be called as the FOLL_GET operation of
64 * follow_page() and it must be called while holding the proper PT
65 * lock while the pte (or pmd_trans_huge) is still mapping the page.
66 */
67static inline void get_page_foll(struct page *page)
68{
69 if (unlikely(PageTail(page)))
70 /*
71 * This is safe only because
72 * __split_huge_page_refcount() can't run under
73 * get_page_foll() because we hold the proper PT lock.
74 */
75 __get_page_tail_foll(page, true);
76 else {
77 /*
78 * Getting a normal page or the head of a compound page
79 * requires to already have an elevated page->_count.
80 */
81 VM_BUG_ON(atomic_read(&page->_count) <= 0);
82 atomic_inc(&page->_count);
83 }
84}
85
40extern unsigned long highest_memmap_pfn; 86extern unsigned long highest_memmap_pfn;
41 87
42/* 88/*
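These helpers also set the calling convention for FOLL_GET in the rest of the series: the page table lock must still pin the pte (or huge pmd) that maps the page, and the reference is taken with get_page_foll() rather than get_page(), so a tail page pins its head's _count and bumps its own _mapcount. A minimal sketch of a conforming caller (the function below is hypothetical; kernel context assumed):

	static struct page *grab_mapped_page(struct vm_area_struct *vma,
					     unsigned long addr, pte_t *ptep,
					     spinlock_t *ptl, unsigned int flags)
	{
		struct page *page;

		assert_spin_locked(ptl);              /* pte still maps the page */
		page = vm_normal_page(vma, addr, *ptep);
		if (page && (flags & FOLL_GET))
			get_page_foll(page);          /* head, tail and normal pages alike */
		return page;
	}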
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index aacee45616f..d6880f542f9 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -96,7 +96,7 @@
96 96
97#include <asm/sections.h> 97#include <asm/sections.h>
98#include <asm/processor.h> 98#include <asm/processor.h>
99#include <asm/atomic.h> 99#include <linux/atomic.h>
100 100
101#include <linux/kmemcheck.h> 101#include <linux/kmemcheck.h>
102#include <linux/kmemleak.h> 102#include <linux/kmemleak.h>
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed50..74bf193eff0 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma,
218 endoff = (loff_t)(end - vma->vm_start - 1) 218 endoff = (loff_t)(end - vma->vm_start - 1)
219 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 219 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
220 220
221 /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ 221 /* vmtruncate_range needs to take i_mutex */
222 up_read(&current->mm->mmap_sem); 222 up_read(&current->mm->mmap_sem);
223 error = vmtruncate_range(mapping->host, offset, endoff); 223 error = vmtruncate_range(mapping->host, offset, endoff);
224 down_read(&current->mm->mmap_sem); 224 down_read(&current->mm->mmap_sem);
diff --git a/mm/memblock.c b/mm/memblock.c
index a0562d1a6ad..ccbf9733959 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -758,9 +758,9 @@ void __init memblock_analyze(void)
758 758
759 /* Check marker in the unused last array entry */ 759 /* Check marker in the unused last array entry */
760 WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base 760 WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
761 != (phys_addr_t)RED_INACTIVE); 761 != MEMBLOCK_INACTIVE);
762 WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base 762 WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
763 != (phys_addr_t)RED_INACTIVE); 763 != MEMBLOCK_INACTIVE);
764 764
765 memblock.memory_size = 0; 765 memblock.memory_size = 0;
766 766
@@ -786,8 +786,8 @@ void __init memblock_init(void)
786 memblock.reserved.max = INIT_MEMBLOCK_REGIONS; 786 memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
787 787
788 /* Write a marker in the unused last array entry */ 788 /* Write a marker in the unused last array entry */
789 memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; 789 memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
790 memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; 790 memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
791 791
792 /* Create a dummy zero size MEMBLOCK which will get coalesced away later. 792 /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
793 * This simplifies the memblock_add() code below... 793 * This simplifies the memblock_add() code below...
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e013b8e57d2..dd81ddc64b4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,7 +35,6 @@
35#include <linux/limits.h> 35#include <linux/limits.h>
36#include <linux/mutex.h> 36#include <linux/mutex.h>
37#include <linux/rbtree.h> 37#include <linux/rbtree.h>
38#include <linux/shmem_fs.h>
39#include <linux/slab.h> 38#include <linux/slab.h>
40#include <linux/swap.h> 39#include <linux/swap.h>
41#include <linux/swapops.h> 40#include <linux/swapops.h>
@@ -246,10 +245,13 @@ struct mem_cgroup {
246 * Should the accounting and control be hierarchical, per subtree? 245 * Should the accounting and control be hierarchical, per subtree?
247 */ 246 */
248 bool use_hierarchy; 247 bool use_hierarchy;
249 atomic_t oom_lock; 248
249 bool oom_lock;
250 atomic_t under_oom;
251
250 atomic_t refcnt; 252 atomic_t refcnt;
251 253
252 unsigned int swappiness; 254 int swappiness;
253 /* OOM-Killer disable */ 255 /* OOM-Killer disable */
254 int oom_kill_disable; 256 int oom_kill_disable;
255 257
@@ -636,27 +638,44 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
636 preempt_enable(); 638 preempt_enable();
637} 639}
638 640
639static unsigned long 641unsigned long
640mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx) 642mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
643 unsigned int lru_mask)
641{ 644{
642 struct mem_cgroup_per_zone *mz; 645 struct mem_cgroup_per_zone *mz;
646 enum lru_list l;
647 unsigned long ret = 0;
648
649 mz = mem_cgroup_zoneinfo(mem, nid, zid);
650
651 for_each_lru(l) {
652 if (BIT(l) & lru_mask)
653 ret += MEM_CGROUP_ZSTAT(mz, l);
654 }
655 return ret;
656}
657
658static unsigned long
659mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
660 int nid, unsigned int lru_mask)
661{
643 u64 total = 0; 662 u64 total = 0;
644 int zid; 663 int zid;
645 664
646 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 665 for (zid = 0; zid < MAX_NR_ZONES; zid++)
647 mz = mem_cgroup_zoneinfo(mem, nid, zid); 666 total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);
648 total += MEM_CGROUP_ZSTAT(mz, idx); 667
649 }
650 return total; 668 return total;
651} 669}
652static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 670
653 enum lru_list idx) 671static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
672 unsigned int lru_mask)
654{ 673{
655 int nid; 674 int nid;
656 u64 total = 0; 675 u64 total = 0;
657 676
658 for_each_online_node(nid) 677 for_each_node_state(nid, N_HIGH_MEMORY)
659 total += mem_cgroup_get_zonestat_node(mem, nid, idx); 678 total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
660 return total; 679 return total;
661} 680}
662 681
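These two helpers replace the per-LRU accessors deleted further down: callers describe which LRU lists to sum as a bitmask of BIT(lru) values (LRU_ALL, LRU_ALL_FILE, LRU_ALL_ANON, or a single BIT(LRU_UNEVICTABLE)), and only the matching per-zone counters are added up. Two call shapes matching the signatures above (illustrative only):

	/* file-backed pages (active + inactive) charged to this memcg on one node */
	unsigned long file = mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE);

	/* unevictable pages across all N_HIGH_MEMORY nodes */
	unsigned long unevictable = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));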
@@ -1043,6 +1062,21 @@ void mem_cgroup_move_lists(struct page *page,
1043 mem_cgroup_add_lru_list(page, to); 1062 mem_cgroup_add_lru_list(page, to);
1044} 1063}
1045 1064
1065/*
1066 * Checks whether given mem is same or in the root_mem's
1067 * hierarchy subtree
1068 */
1069static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem,
1070 struct mem_cgroup *mem)
1071{
1072 if (root_mem != mem) {
1073 return (root_mem->use_hierarchy &&
1074 css_is_ancestor(&mem->css, &root_mem->css));
1075 }
1076
1077 return true;
1078}
1079
1046int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 1080int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
1047{ 1081{
1048 int ret; 1082 int ret;
@@ -1062,10 +1096,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
1062 * enabled in "curr" and "curr" is a child of "mem" in *cgroup* 1096 * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
1063 * hierarchy(even if use_hierarchy is disabled in "mem"). 1097 * hierarchy(even if use_hierarchy is disabled in "mem").
1064 */ 1098 */
1065 if (mem->use_hierarchy) 1099 ret = mem_cgroup_same_or_subtree(mem, curr);
1066 ret = css_is_ancestor(&curr->css, &mem->css);
1067 else
1068 ret = (curr == mem);
1069 css_put(&curr->css); 1100 css_put(&curr->css);
1070 return ret; 1101 return ret;
1071} 1102}
@@ -1077,8 +1108,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
1077 unsigned long gb; 1108 unsigned long gb;
1078 unsigned long inactive_ratio; 1109 unsigned long inactive_ratio;
1079 1110
1080 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); 1111 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
1081 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); 1112 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
1082 1113
1083 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1114 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1084 if (gb) 1115 if (gb)
@@ -1117,109 +1148,12 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
1117 unsigned long active; 1148 unsigned long active;
1118 unsigned long inactive; 1149 unsigned long inactive;
1119 1150
1120 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); 1151 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
1121 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); 1152 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
1122 1153
1123 return (active > inactive); 1154 return (active > inactive);
1124} 1155}
1125 1156
1126unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
1127 struct zone *zone,
1128 enum lru_list lru)
1129{
1130 int nid = zone_to_nid(zone);
1131 int zid = zone_idx(zone);
1132 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1133
1134 return MEM_CGROUP_ZSTAT(mz, lru);
1135}
1136
1137static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
1138 int nid)
1139{
1140 unsigned long ret;
1141
1142 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) +
1143 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE);
1144
1145 return ret;
1146}
1147
1148static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
1149 int nid)
1150{
1151 unsigned long ret;
1152
1153 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
1154 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
1155 return ret;
1156}
1157
1158#if MAX_NUMNODES > 1
1159static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
1160{
1161 u64 total = 0;
1162 int nid;
1163
1164 for_each_node_state(nid, N_HIGH_MEMORY)
1165 total += mem_cgroup_node_nr_file_lru_pages(memcg, nid);
1166
1167 return total;
1168}
1169
1170static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
1171{
1172 u64 total = 0;
1173 int nid;
1174
1175 for_each_node_state(nid, N_HIGH_MEMORY)
1176 total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid);
1177
1178 return total;
1179}
1180
1181static unsigned long
1182mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid)
1183{
1184 return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE);
1185}
1186
1187static unsigned long
1188mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg)
1189{
1190 u64 total = 0;
1191 int nid;
1192
1193 for_each_node_state(nid, N_HIGH_MEMORY)
1194 total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid);
1195
1196 return total;
1197}
1198
1199static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
1200 int nid)
1201{
1202 enum lru_list l;
1203 u64 total = 0;
1204
1205 for_each_lru(l)
1206 total += mem_cgroup_get_zonestat_node(memcg, nid, l);
1207
1208 return total;
1209}
1210
1211static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg)
1212{
1213 u64 total = 0;
1214 int nid;
1215
1216 for_each_node_state(nid, N_HIGH_MEMORY)
1217 total += mem_cgroup_node_nr_lru_pages(memcg, nid);
1218
1219 return total;
1220}
1221#endif /* CONFIG_NUMA */
1222
1223struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1157struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1224 struct zone *zone) 1158 struct zone *zone)
1225{ 1159{
@@ -1329,7 +1263,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
1329 return margin >> PAGE_SHIFT; 1263 return margin >> PAGE_SHIFT;
1330} 1264}
1331 1265
1332static unsigned int get_swappiness(struct mem_cgroup *memcg) 1266int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1333{ 1267{
1334 struct cgroup *cgrp = memcg->css.cgroup; 1268 struct cgroup *cgrp = memcg->css.cgroup;
1335 1269
@@ -1401,10 +1335,9 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1401 to = mc.to; 1335 to = mc.to;
1402 if (!from) 1336 if (!from)
1403 goto unlock; 1337 goto unlock;
1404 if (from == mem || to == mem 1338
1405 || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) 1339 ret = mem_cgroup_same_or_subtree(mem, from)
1406 || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) 1340 || mem_cgroup_same_or_subtree(mem, to);
1407 ret = true;
1408unlock: 1341unlock:
1409 spin_unlock(&mc.lock); 1342 spin_unlock(&mc.lock);
1410 return ret; 1343 return ret;
@@ -1576,11 +1509,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1576static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, 1509static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
1577 int nid, bool noswap) 1510 int nid, bool noswap)
1578{ 1511{
1579 if (mem_cgroup_node_nr_file_lru_pages(mem, nid)) 1512 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
1580 return true; 1513 return true;
1581 if (noswap || !total_swap_pages) 1514 if (noswap || !total_swap_pages)
1582 return false; 1515 return false;
1583 if (mem_cgroup_node_nr_anon_lru_pages(mem, nid)) 1516 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON))
1584 return true; 1517 return true;
1585 return false; 1518 return false;
1586 1519
@@ -1730,7 +1663,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1730 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1663 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1731 1664
1732 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1665 /* If memsw_is_minimum==1, swap-out is of-no-use. */
1733 if (!check_soft && root_mem->memsw_is_minimum) 1666 if (!check_soft && !shrink && root_mem->memsw_is_minimum)
1734 noswap = true; 1667 noswap = true;
1735 1668
1736 while (1) { 1669 while (1) {
@@ -1776,12 +1709,11 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1776 /* we use swappiness of local cgroup */ 1709 /* we use swappiness of local cgroup */
1777 if (check_soft) { 1710 if (check_soft) {
1778 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1711 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1779 noswap, get_swappiness(victim), zone, 1712 noswap, zone, &nr_scanned);
1780 &nr_scanned);
1781 *total_scanned += nr_scanned; 1713 *total_scanned += nr_scanned;
1782 } else 1714 } else
1783 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1715 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1784 noswap, get_swappiness(victim)); 1716 noswap);
1785 css_put(&victim->css); 1717 css_put(&victim->css);
1786 /* 1718 /*
1787 * At shrinking usage, we can't check we should stop here or 1719 * At shrinking usage, we can't check we should stop here or
@@ -1803,38 +1735,77 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1803/* 1735/*
1804 * Check OOM-Killer is already running under our hierarchy. 1736 * Check OOM-Killer is already running under our hierarchy.
1805 * If someone is running, return false. 1737 * If someone is running, return false.
1738 * Has to be called with memcg_oom_lock
1806 */ 1739 */
1807static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1740static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1808{ 1741{
1809 int x, lock_count = 0; 1742 struct mem_cgroup *iter, *failed = NULL;
1810 struct mem_cgroup *iter; 1743 bool cond = true;
1811 1744
1812 for_each_mem_cgroup_tree(iter, mem) { 1745 for_each_mem_cgroup_tree_cond(iter, mem, cond) {
1813 x = atomic_inc_return(&iter->oom_lock); 1746 if (iter->oom_lock) {
1814 lock_count = max(x, lock_count); 1747 /*
1748 * this subtree of our hierarchy is already locked
1749 * so we cannot give a lock.
1750 */
1751 failed = iter;
1752 cond = false;
1753 } else
1754 iter->oom_lock = true;
1815 } 1755 }
1816 1756
1817 if (lock_count == 1) 1757 if (!failed)
1818 return true; 1758 return true;
1759
1760 /*
1761 * OK, we failed to lock the whole subtree so we have to clean up
1762 * what we set up to the failing subtree
1763 */
1764 cond = true;
1765 for_each_mem_cgroup_tree_cond(iter, mem, cond) {
1766 if (iter == failed) {
1767 cond = false;
1768 continue;
1769 }
1770 iter->oom_lock = false;
1771 }
1819 return false; 1772 return false;
1820} 1773}
1821 1774
1775/*
1776 * Has to be called with memcg_oom_lock
1777 */
1822static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) 1778static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1823{ 1779{
1824 struct mem_cgroup *iter; 1780 struct mem_cgroup *iter;
1825 1781
1782 for_each_mem_cgroup_tree(iter, mem)
1783 iter->oom_lock = false;
1784 return 0;
1785}
1786
1787static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
1788{
1789 struct mem_cgroup *iter;
1790
1791 for_each_mem_cgroup_tree(iter, mem)
1792 atomic_inc(&iter->under_oom);
1793}
1794
1795static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
1796{
1797 struct mem_cgroup *iter;
1798
1826 /* 1799 /*
1827 * When a new child is created while the hierarchy is under oom, 1800 * When a new child is created while the hierarchy is under oom,
1828 * mem_cgroup_oom_lock() may not be called. We have to use 1801 * mem_cgroup_oom_lock() may not be called. We have to use
1829 * atomic_add_unless() here. 1802 * atomic_add_unless() here.
1830 */ 1803 */
1831 for_each_mem_cgroup_tree(iter, mem) 1804 for_each_mem_cgroup_tree(iter, mem)
1832 atomic_add_unless(&iter->oom_lock, -1, 0); 1805 atomic_add_unless(&iter->under_oom, -1, 0);
1833 return 0;
1834} 1806}
1835 1807
1836 1808static DEFINE_SPINLOCK(memcg_oom_lock);
1837static DEFINE_MUTEX(memcg_oom_mutex);
1838static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1809static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1839 1810
1840struct oom_wait_info { 1811struct oom_wait_info {
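The new mem_cgroup_oom_lock() is a two-pass, try-lock-with-rollback walk: claim oom_lock on each memcg of the subtree in walk order until one is found already locked, then walk again and release only what was claimed before the failing node. The same shape reduced to a generic sketch over an ordered array ('struct node' and its 'locked' flag are hypothetical):

	static bool try_lock_all(struct node **set, int n)
	{
		int i, failed = -1;

		for (i = 0; i < n; i++) {
			if (set[i]->locked) {        /* someone holds part of the set */
				failed = i;
				break;
			}
			set[i]->locked = true;
		}
		if (failed < 0)
			return true;
		while (--failed >= 0)                /* roll back only our claims */
			set[failed]->locked = false;
		return false;
	}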
@@ -1845,25 +1816,20 @@ struct oom_wait_info {
1845static int memcg_oom_wake_function(wait_queue_t *wait, 1816static int memcg_oom_wake_function(wait_queue_t *wait,
1846 unsigned mode, int sync, void *arg) 1817 unsigned mode, int sync, void *arg)
1847{ 1818{
1848 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; 1819 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg,
1820 *oom_wait_mem;
1849 struct oom_wait_info *oom_wait_info; 1821 struct oom_wait_info *oom_wait_info;
1850 1822
1851 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1823 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1824 oom_wait_mem = oom_wait_info->mem;
1852 1825
1853 if (oom_wait_info->mem == wake_mem)
1854 goto wakeup;
1855 /* if no hierarchy, no match */
1856 if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
1857 return 0;
1858 /* 1826 /*
1859 * Both of oom_wait_info->mem and wake_mem are stable under us. 1827 * Both of oom_wait_info->mem and wake_mem are stable under us.
1860 * Then we can use css_is_ancestor without taking care of RCU. 1828 * Then we can use css_is_ancestor without taking care of RCU.
1861 */ 1829 */
1862 if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && 1830 if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem)
1863 !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) 1831 && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem))
1864 return 0; 1832 return 0;
1865
1866wakeup:
1867 return autoremove_wake_function(wait, mode, sync, arg); 1833 return autoremove_wake_function(wait, mode, sync, arg);
1868} 1834}
1869 1835
@@ -1875,7 +1841,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
1875 1841
1876static void memcg_oom_recover(struct mem_cgroup *mem) 1842static void memcg_oom_recover(struct mem_cgroup *mem)
1877{ 1843{
1878 if (mem && atomic_read(&mem->oom_lock)) 1844 if (mem && atomic_read(&mem->under_oom))
1879 memcg_wakeup_oom(mem); 1845 memcg_wakeup_oom(mem);
1880} 1846}
1881 1847
@@ -1893,8 +1859,10 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1893 owait.wait.private = current; 1859 owait.wait.private = current;
1894 INIT_LIST_HEAD(&owait.wait.task_list); 1860 INIT_LIST_HEAD(&owait.wait.task_list);
1895 need_to_kill = true; 1861 need_to_kill = true;
1862 mem_cgroup_mark_under_oom(mem);
1863
1896 /* At first, try to OOM lock hierarchy under mem.*/ 1864 /* At first, try to OOM lock hierarchy under mem.*/
1897 mutex_lock(&memcg_oom_mutex); 1865 spin_lock(&memcg_oom_lock);
1898 locked = mem_cgroup_oom_lock(mem); 1866 locked = mem_cgroup_oom_lock(mem);
1899 /* 1867 /*
1900 * Even if signal_pending(), we can't quit charge() loop without 1868 * Even if signal_pending(), we can't quit charge() loop without
@@ -1906,7 +1874,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1906 need_to_kill = false; 1874 need_to_kill = false;
1907 if (locked) 1875 if (locked)
1908 mem_cgroup_oom_notify(mem); 1876 mem_cgroup_oom_notify(mem);
1909 mutex_unlock(&memcg_oom_mutex); 1877 spin_unlock(&memcg_oom_lock);
1910 1878
1911 if (need_to_kill) { 1879 if (need_to_kill) {
1912 finish_wait(&memcg_oom_waitq, &owait.wait); 1880 finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1915,10 +1883,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1915 schedule(); 1883 schedule();
1916 finish_wait(&memcg_oom_waitq, &owait.wait); 1884 finish_wait(&memcg_oom_waitq, &owait.wait);
1917 } 1885 }
1918 mutex_lock(&memcg_oom_mutex); 1886 spin_lock(&memcg_oom_lock);
1919 mem_cgroup_oom_unlock(mem); 1887 if (locked)
1888 mem_cgroup_oom_unlock(mem);
1920 memcg_wakeup_oom(mem); 1889 memcg_wakeup_oom(mem);
1921 mutex_unlock(&memcg_oom_mutex); 1890 spin_unlock(&memcg_oom_lock);
1891
1892 mem_cgroup_unmark_under_oom(mem);
1922 1893
1923 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 1894 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1924 return false; 1895 return false;
@@ -2079,59 +2050,70 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
2079} 2050}
2080 2051
2081/* 2052/*
2082 * Tries to drain stocked charges in other cpus. This function is asynchronous 2053 * Drains all per-CPU charge caches for given root_mem resp. subtree
2083 * and just put a work per cpu for draining localy on each cpu. Caller can 2054 * of the hierarchy under it. sync flag says whether we should block
2084 * expects some charges will be back to res_counter later but cannot wait for 2055 * until the work is done.
2085 * it.
2086 */ 2056 */
2087static void drain_all_stock_async(struct mem_cgroup *root_mem) 2057static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
2088{ 2058{
2089 int cpu, curcpu; 2059 int cpu, curcpu;
2090 /* 2060
2091 * If someone calls draining, avoid adding more kworker runs.
2092 */
2093 if (!mutex_trylock(&percpu_charge_mutex))
2094 return;
2095 /* Notify other cpus that system-wide "drain" is running */ 2061 /* Notify other cpus that system-wide "drain" is running */
2096 get_online_cpus(); 2062 get_online_cpus();
2097 /* 2063 curcpu = get_cpu();
2098 * Get a hint for avoiding draining charges on the current cpu,
2099 * which must be exhausted by our charging. It is not required that
2100 * this be a precise check, so we use raw_smp_processor_id() instead of
2101 * getcpu()/putcpu().
2102 */
2103 curcpu = raw_smp_processor_id();
2104 for_each_online_cpu(cpu) { 2064 for_each_online_cpu(cpu) {
2105 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2065 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2106 struct mem_cgroup *mem; 2066 struct mem_cgroup *mem;
2107 2067
2108 if (cpu == curcpu)
2109 continue;
2110
2111 mem = stock->cached; 2068 mem = stock->cached;
2112 if (!mem) 2069 if (!mem || !stock->nr_pages)
2070 continue;
2071 if (!mem_cgroup_same_or_subtree(root_mem, mem))
2113 continue; 2072 continue;
2114 if (mem != root_mem) { 2073 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2115 if (!root_mem->use_hierarchy) 2074 if (cpu == curcpu)
2116 continue; 2075 drain_local_stock(&stock->work);
2117 /* check whether "mem" is under tree of "root_mem" */ 2076 else
2118 if (!css_is_ancestor(&mem->css, &root_mem->css)) 2077 schedule_work_on(cpu, &stock->work);
2119 continue;
2120 } 2078 }
2121 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2122 schedule_work_on(cpu, &stock->work);
2123 } 2079 }
2080 put_cpu();
2081
2082 if (!sync)
2083 goto out;
2084
2085 for_each_online_cpu(cpu) {
2086 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2087 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2088 flush_work(&stock->work);
2089 }
2090out:
2124 put_online_cpus(); 2091 put_online_cpus();
2092}
2093
2094/*
2095 * Tries to drain stocked charges in other cpus. This function is asynchronous
 2096 * and just puts a work per cpu for draining locally on each cpu. Caller can
 2097 * expect some charges will be back to res_counter later but cannot wait for
2098 * it.
2099 */
2100static void drain_all_stock_async(struct mem_cgroup *root_mem)
2101{
2102 /*
2103 * If someone calls draining, avoid adding more kworker runs.
2104 */
2105 if (!mutex_trylock(&percpu_charge_mutex))
2106 return;
2107 drain_all_stock(root_mem, false);
2125 mutex_unlock(&percpu_charge_mutex); 2108 mutex_unlock(&percpu_charge_mutex);
2126 /* We don't wait for flush_work */
2127} 2109}
2128 2110
2129/* This is a synchronous drain interface. */ 2111/* This is a synchronous drain interface. */
2130static void drain_all_stock_sync(void) 2112static void drain_all_stock_sync(struct mem_cgroup *root_mem)
2131{ 2113{
2132 /* called when force_empty is called */ 2114 /* called when force_empty is called */
2133 mutex_lock(&percpu_charge_mutex); 2115 mutex_lock(&percpu_charge_mutex);
2134 schedule_on_each_cpu(drain_local_stock); 2116 drain_all_stock(root_mem, true);
2135 mutex_unlock(&percpu_charge_mutex); 2117 mutex_unlock(&percpu_charge_mutex);
2136} 2118}
2137 2119
@@ -2784,30 +2766,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2784 return 0; 2766 return 0;
2785 if (PageCompound(page)) 2767 if (PageCompound(page))
2786 return 0; 2768 return 0;
2787 /*
2788 * Corner case handling. This is called from add_to_page_cache()
2789 * in usual. But some FS (shmem) precharges this page before calling it
2790 * and call add_to_page_cache() with GFP_NOWAIT.
2791 *
2792 * For GFP_NOWAIT case, the page may be pre-charged before calling
2793 * add_to_page_cache(). (See shmem.c) check it here and avoid to call
2794 * charge twice. (It works but has to pay a bit larger cost.)
2795 * And when the page is SwapCache, it should take swap information
2796 * into account. This is under lock_page() now.
2797 */
2798 if (!(gfp_mask & __GFP_WAIT)) {
2799 struct page_cgroup *pc;
2800
2801 pc = lookup_page_cgroup(page);
2802 if (!pc)
2803 return 0;
2804 lock_page_cgroup(pc);
2805 if (PageCgroupUsed(pc)) {
2806 unlock_page_cgroup(pc);
2807 return 0;
2808 }
2809 unlock_page_cgroup(pc);
2810 }
2811 2769
2812 if (unlikely(!mm)) 2770 if (unlikely(!mm))
2813 mm = &init_mm; 2771 mm = &init_mm;
@@ -3398,28 +3356,47 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
3398} 3356}
3399 3357
3400/* 3358/*
3401 * A call to try to shrink memory usage on charge failure at shmem's swapin. 3359 * At replace page cache, newpage is not under any memcg but it's on
3402 * Calling hierarchical_reclaim is not enough because we should update 3360 * LRU. So, this function doesn't touch res_counter but handles LRU
3403 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. 3361 * in correct way. Both pages are locked so we cannot race with uncharge.
3404 * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
3405 * not from the memcg which this page would be charged to.
3406 * try_charge_swapin does all of these works properly.
3407 */ 3362 */
3408int mem_cgroup_shmem_charge_fallback(struct page *page, 3363void mem_cgroup_replace_page_cache(struct page *oldpage,
3409 struct mm_struct *mm, 3364 struct page *newpage)
3410 gfp_t gfp_mask)
3411{ 3365{
3412 struct mem_cgroup *mem; 3366 struct mem_cgroup *memcg;
3413 int ret; 3367 struct page_cgroup *pc;
3368 struct zone *zone;
3369 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3370 unsigned long flags;
3414 3371
3415 if (mem_cgroup_disabled()) 3372 if (mem_cgroup_disabled())
3416 return 0; 3373 return;
3417 3374
3418 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 3375 pc = lookup_page_cgroup(oldpage);
3419 if (!ret) 3376 /* fix accounting on old pages */
3420 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ 3377 lock_page_cgroup(pc);
3378 memcg = pc->mem_cgroup;
3379 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
3380 ClearPageCgroupUsed(pc);
3381 unlock_page_cgroup(pc);
3421 3382
3422 return ret; 3383 if (PageSwapBacked(oldpage))
3384 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3385
3386 zone = page_zone(newpage);
3387 pc = lookup_page_cgroup(newpage);
3388 /*
3389 * Even if newpage->mapping was NULL before starting replacement,
3390 * the newpage may be on LRU(or pagevec for LRU) already. We lock
3391 * LRU while we overwrite pc->mem_cgroup.
3392 */
3393 spin_lock_irqsave(&zone->lru_lock, flags);
3394 if (PageLRU(newpage))
3395 del_page_from_lru_list(zone, newpage, page_lru(newpage));
3396 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type);
3397 if (PageLRU(newpage))
3398 add_page_to_lru_list(zone, newpage, page_lru(newpage));
3399 spin_unlock_irqrestore(&zone->lru_lock, flags);
3423} 3400}
3424 3401
3425#ifdef CONFIG_DEBUG_VM 3402#ifdef CONFIG_DEBUG_VM
@@ -3780,7 +3757,7 @@ move_account:
3780 goto out; 3757 goto out;
3781 /* This is for making all *used* pages to be on LRU. */ 3758 /* This is for making all *used* pages to be on LRU. */
3782 lru_add_drain_all(); 3759 lru_add_drain_all();
3783 drain_all_stock_sync(); 3760 drain_all_stock_sync(mem);
3784 ret = 0; 3761 ret = 0;
3785 mem_cgroup_start_move(mem); 3762 mem_cgroup_start_move(mem);
3786 for_each_node_state(node, N_HIGH_MEMORY) { 3763 for_each_node_state(node, N_HIGH_MEMORY) {
@@ -3826,7 +3803,7 @@ try_to_free:
3826 goto out; 3803 goto out;
3827 } 3804 }
3828 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 3805 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3829 false, get_swappiness(mem)); 3806 false);
3830 if (!progress) { 3807 if (!progress) {
3831 nr_retries--; 3808 nr_retries--;
3832 /* maybe some writeback is necessary */ 3809 /* maybe some writeback is necessary */
@@ -4152,15 +4129,15 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
4152 s->stat[MCS_PGMAJFAULT] += val; 4129 s->stat[MCS_PGMAJFAULT] += val;
4153 4130
4154 /* per zone stat */ 4131 /* per zone stat */
4155 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 4132 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON));
4156 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 4133 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4157 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); 4134 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON));
4158 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 4135 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4159 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); 4136 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE));
4160 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 4137 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4161 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); 4138 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE));
4162 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 4139 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4163 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 4140 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));
4164 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 4141 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4165} 4142}
4166 4143
@@ -4182,35 +4159,37 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4182 struct cgroup *cont = m->private; 4159 struct cgroup *cont = m->private;
4183 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4160 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4184 4161
4185 total_nr = mem_cgroup_nr_lru_pages(mem_cont); 4162 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
4186 seq_printf(m, "total=%lu", total_nr); 4163 seq_printf(m, "total=%lu", total_nr);
4187 for_each_node_state(nid, N_HIGH_MEMORY) { 4164 for_each_node_state(nid, N_HIGH_MEMORY) {
4188 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid); 4165 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
4189 seq_printf(m, " N%d=%lu", nid, node_nr); 4166 seq_printf(m, " N%d=%lu", nid, node_nr);
4190 } 4167 }
4191 seq_putc(m, '\n'); 4168 seq_putc(m, '\n');
4192 4169
4193 file_nr = mem_cgroup_nr_file_lru_pages(mem_cont); 4170 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
4194 seq_printf(m, "file=%lu", file_nr); 4171 seq_printf(m, "file=%lu", file_nr);
4195 for_each_node_state(nid, N_HIGH_MEMORY) { 4172 for_each_node_state(nid, N_HIGH_MEMORY) {
4196 node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid); 4173 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4174 LRU_ALL_FILE);
4197 seq_printf(m, " N%d=%lu", nid, node_nr); 4175 seq_printf(m, " N%d=%lu", nid, node_nr);
4198 } 4176 }
4199 seq_putc(m, '\n'); 4177 seq_putc(m, '\n');
4200 4178
4201 anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont); 4179 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
4202 seq_printf(m, "anon=%lu", anon_nr); 4180 seq_printf(m, "anon=%lu", anon_nr);
4203 for_each_node_state(nid, N_HIGH_MEMORY) { 4181 for_each_node_state(nid, N_HIGH_MEMORY) {
4204 node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid); 4182 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4183 LRU_ALL_ANON);
4205 seq_printf(m, " N%d=%lu", nid, node_nr); 4184 seq_printf(m, " N%d=%lu", nid, node_nr);
4206 } 4185 }
4207 seq_putc(m, '\n'); 4186 seq_putc(m, '\n');
4208 4187
4209 unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont); 4188 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
4210 seq_printf(m, "unevictable=%lu", unevictable_nr); 4189 seq_printf(m, "unevictable=%lu", unevictable_nr);
4211 for_each_node_state(nid, N_HIGH_MEMORY) { 4190 for_each_node_state(nid, N_HIGH_MEMORY) {
4212 node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont, 4191 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4213 nid); 4192 BIT(LRU_UNEVICTABLE));
4214 seq_printf(m, " N%d=%lu", nid, node_nr); 4193 seq_printf(m, " N%d=%lu", nid, node_nr);
4215 } 4194 }
4216 seq_putc(m, '\n'); 4195 seq_putc(m, '\n');
@@ -4288,7 +4267,7 @@ static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
4288{ 4267{
4289 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4268 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4290 4269
4291 return get_swappiness(memcg); 4270 return mem_cgroup_swappiness(memcg);
4292} 4271}
4293 4272
4294static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 4273static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
@@ -4578,15 +4557,15 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4578 if (!event) 4557 if (!event)
4579 return -ENOMEM; 4558 return -ENOMEM;
4580 4559
4581 mutex_lock(&memcg_oom_mutex); 4560 spin_lock(&memcg_oom_lock);
4582 4561
4583 event->eventfd = eventfd; 4562 event->eventfd = eventfd;
4584 list_add(&event->list, &memcg->oom_notify); 4563 list_add(&event->list, &memcg->oom_notify);
4585 4564
4586 /* already in OOM ? */ 4565 /* already in OOM ? */
4587 if (atomic_read(&memcg->oom_lock)) 4566 if (atomic_read(&memcg->under_oom))
4588 eventfd_signal(eventfd, 1); 4567 eventfd_signal(eventfd, 1);
4589 mutex_unlock(&memcg_oom_mutex); 4568 spin_unlock(&memcg_oom_lock);
4590 4569
4591 return 0; 4570 return 0;
4592} 4571}
@@ -4600,7 +4579,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4600 4579
4601 BUG_ON(type != _OOM_TYPE); 4580 BUG_ON(type != _OOM_TYPE);
4602 4581
4603 mutex_lock(&memcg_oom_mutex); 4582 spin_lock(&memcg_oom_lock);
4604 4583
4605 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 4584 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
4606 if (ev->eventfd == eventfd) { 4585 if (ev->eventfd == eventfd) {
@@ -4609,7 +4588,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4609 } 4588 }
4610 } 4589 }
4611 4590
4612 mutex_unlock(&memcg_oom_mutex); 4591 spin_unlock(&memcg_oom_lock);
4613} 4592}
4614 4593
4615static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4594static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
@@ -4619,7 +4598,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4619 4598
4620 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 4599 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
4621 4600
4622 if (atomic_read(&mem->oom_lock)) 4601 if (atomic_read(&mem->under_oom))
4623 cb->fill(cb, "under_oom", 1); 4602 cb->fill(cb, "under_oom", 1);
4624 else 4603 else
4625 cb->fill(cb, "under_oom", 0); 4604 cb->fill(cb, "under_oom", 0);
@@ -4963,9 +4942,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4963 int cpu; 4942 int cpu;
4964 enable_swap_cgroup(); 4943 enable_swap_cgroup();
4965 parent = NULL; 4944 parent = NULL;
4966 root_mem_cgroup = mem;
4967 if (mem_cgroup_soft_limit_tree_init()) 4945 if (mem_cgroup_soft_limit_tree_init())
4968 goto free_out; 4946 goto free_out;
4947 root_mem_cgroup = mem;
4969 for_each_possible_cpu(cpu) { 4948 for_each_possible_cpu(cpu) {
4970 struct memcg_stock_pcp *stock = 4949 struct memcg_stock_pcp *stock =
4971 &per_cpu(memcg_stock, cpu); 4950 &per_cpu(memcg_stock, cpu);
@@ -4997,14 +4976,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4997 INIT_LIST_HEAD(&mem->oom_notify); 4976 INIT_LIST_HEAD(&mem->oom_notify);
4998 4977
4999 if (parent) 4978 if (parent)
5000 mem->swappiness = get_swappiness(parent); 4979 mem->swappiness = mem_cgroup_swappiness(parent);
5001 atomic_set(&mem->refcnt, 1); 4980 atomic_set(&mem->refcnt, 1);
5002 mem->move_charge_at_immigrate = 0; 4981 mem->move_charge_at_immigrate = 0;
5003 mutex_init(&mem->thresholds_lock); 4982 mutex_init(&mem->thresholds_lock);
5004 return &mem->css; 4983 return &mem->css;
5005free_out: 4984free_out:
5006 __mem_cgroup_free(mem); 4985 __mem_cgroup_free(mem);
5007 root_mem_cgroup = NULL;
5008 return ERR_PTR(error); 4986 return ERR_PTR(error);
5009} 4987}
5010 4988
@@ -5181,15 +5159,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5181 pgoff = pte_to_pgoff(ptent); 5159 pgoff = pte_to_pgoff(ptent);
5182 5160
5183 /* page is moved even if it's not RSS of this task(page-faulted). */ 5161 /* page is moved even if it's not RSS of this task(page-faulted). */
5184 if (!mapping_cap_swap_backed(mapping)) { /* normal file */ 5162 page = find_get_page(mapping, pgoff);
5185 page = find_get_page(mapping, pgoff); 5163
5186 } else { /* shmem/tmpfs file. we should take account of swap too. */ 5164#ifdef CONFIG_SWAP
5187 swp_entry_t ent; 5165 /* shmem/tmpfs may report page out on swap: account for that too. */
5188 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); 5166 if (radix_tree_exceptional_entry(page)) {
5167 swp_entry_t swap = radix_to_swp_entry(page);
5189 if (do_swap_account) 5168 if (do_swap_account)
5190 entry->val = ent.val; 5169 *entry = swap;
5170 page = find_get_page(&swapper_space, swap.val);
5191 } 5171 }
5192 5172#endif
5193 return page; 5173 return page;
5194} 5174}
5195 5175
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 740c4f52059..2b43ba051ac 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -53,6 +53,7 @@
53#include <linux/hugetlb.h> 53#include <linux/hugetlb.h>
54#include <linux/memory_hotplug.h> 54#include <linux/memory_hotplug.h>
55#include <linux/mm_inline.h> 55#include <linux/mm_inline.h>
56#include <linux/kfifo.h>
56#include "internal.h" 57#include "internal.h"
57 58
58int sysctl_memory_failure_early_kill __read_mostly = 0; 59int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1178,6 +1179,97 @@ void memory_failure(unsigned long pfn, int trapno)
1178 __memory_failure(pfn, trapno, 0); 1179 __memory_failure(pfn, trapno, 0);
1179} 1180}
1180 1181
1182#define MEMORY_FAILURE_FIFO_ORDER 4
1183#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
1184
1185struct memory_failure_entry {
1186 unsigned long pfn;
1187 int trapno;
1188 int flags;
1189};
1190
1191struct memory_failure_cpu {
1192 DECLARE_KFIFO(fifo, struct memory_failure_entry,
1193 MEMORY_FAILURE_FIFO_SIZE);
1194 spinlock_t lock;
1195 struct work_struct work;
1196};
1197
1198static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
1199
1200/**
1201 * memory_failure_queue - Schedule handling memory failure of a page.
1202 * @pfn: Page Number of the corrupted page
1203 * @trapno: Trap number reported in the signal to user space.
1204 * @flags: Flags for memory failure handling
1205 *
1206 * This function is called by the low level hardware error handler
1207 * when it detects hardware memory corruption of a page. It schedules
1208 * the recovering of error page, including dropping pages, killing
1209 * processes etc.
1210 *
1211 * The function is primarily of use for corruptions that
1212 * happen outside the current execution context (e.g. when
1213 * detected by a background scrubber)
1214 *
1215 * Can run in IRQ context.
1216 */
1217void memory_failure_queue(unsigned long pfn, int trapno, int flags)
1218{
1219 struct memory_failure_cpu *mf_cpu;
1220 unsigned long proc_flags;
1221 struct memory_failure_entry entry = {
1222 .pfn = pfn,
1223 .trapno = trapno,
1224 .flags = flags,
1225 };
1226
1227 mf_cpu = &get_cpu_var(memory_failure_cpu);
1228 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1229 if (kfifo_put(&mf_cpu->fifo, &entry))
1230 schedule_work_on(smp_processor_id(), &mf_cpu->work);
1231 else
1232 pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
1233 pfn);
1234 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1235 put_cpu_var(memory_failure_cpu);
1236}
1237EXPORT_SYMBOL_GPL(memory_failure_queue);
1238
1239static void memory_failure_work_func(struct work_struct *work)
1240{
1241 struct memory_failure_cpu *mf_cpu;
1242 struct memory_failure_entry entry = { 0, };
1243 unsigned long proc_flags;
1244 int gotten;
1245
1246 mf_cpu = &__get_cpu_var(memory_failure_cpu);
1247 for (;;) {
1248 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1249 gotten = kfifo_get(&mf_cpu->fifo, &entry);
1250 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1251 if (!gotten)
1252 break;
1253 __memory_failure(entry.pfn, entry.trapno, entry.flags);
1254 }
1255}
1256
1257static int __init memory_failure_init(void)
1258{
1259 struct memory_failure_cpu *mf_cpu;
1260 int cpu;
1261
1262 for_each_possible_cpu(cpu) {
1263 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1264 spin_lock_init(&mf_cpu->lock);
1265 INIT_KFIFO(mf_cpu->fifo);
1266 INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1267 }
1268
1269 return 0;
1270}
1271core_initcall(memory_failure_init);
1272
1181/** 1273/**
1182 * unpoison_memory - Unpoison a previously poisoned page 1274 * unpoison_memory - Unpoison a previously poisoned page
1183 * @pfn: Page number of the to be unpoisoned page 1275 * @pfn: Page number of the to be unpoisoned page
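memory_failure_queue() gives error handlers that may run in interrupt context a way to hand off a corrupted pfn: the entry is pushed into a per-CPU kfifo under an irq-safe spinlock, and memory_failure_work_func() later replays it through __memory_failure() in process context. A hedged usage sketch (the function below is hypothetical; the intended consumers are hardware error handlers such as APEI/GHES):

	/* called from a hardware error handler, possibly in IRQ context */
	static void report_corrupted_page(u64 phys_addr)
	{
		unsigned long pfn = phys_addr >> PAGE_SHIFT;

		/* flags == 0: default recovery policy; trapno is only used for
		 * the signal later delivered to user space */
		memory_failure_queue(pfn, 0, 0);
	}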
diff --git a/mm/memory.c b/mm/memory.c
index 9b8a01d941c..b2b87315cdc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1290,13 +1290,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1290 return addr; 1290 return addr;
1291} 1291}
1292 1292
1293#ifdef CONFIG_PREEMPT
1294# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
1295#else
1296/* No preempt: go for improved straight-line efficiency */
1297# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
1298#endif
1299
1300/** 1293/**
1301 * unmap_vmas - unmap a range of memory covered by a list of vma's 1294 * unmap_vmas - unmap a range of memory covered by a list of vma's
1302 * @tlb: address of the caller's struct mmu_gather 1295 * @tlb: address of the caller's struct mmu_gather
@@ -1310,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1310 * 1303 *
1311 * Unmap all pages in the vma list. 1304 * Unmap all pages in the vma list.
1312 * 1305 *
1313 * We aim to not hold locks for too long (for scheduling latency reasons).
1314 * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
1315 * return the ending mmu_gather to the caller.
1316 *
1317 * Only addresses between `start' and `end' will be unmapped. 1306 * Only addresses between `start' and `end' will be unmapped.
1318 * 1307 *
1319 * The VMA list must be sorted in ascending virtual address order. 1308 * The VMA list must be sorted in ascending virtual address order.
@@ -1514,7 +1503,7 @@ split_fallthrough:
1514 } 1503 }
1515 1504
1516 if (flags & FOLL_GET) 1505 if (flags & FOLL_GET)
1517 get_page(page); 1506 get_page_foll(page);
1518 if (flags & FOLL_TOUCH) { 1507 if (flags & FOLL_TOUCH) {
1519 if ((flags & FOLL_WRITE) && 1508 if ((flags & FOLL_WRITE) &&
1520 !pte_dirty(pte) && !PageDirty(page)) 1509 !pte_dirty(pte) && !PageDirty(page))
@@ -1816,7 +1805,63 @@ next_page:
1816} 1805}
1817EXPORT_SYMBOL(__get_user_pages); 1806EXPORT_SYMBOL(__get_user_pages);
1818 1807
1819/** 1808/*
1809 * fixup_user_fault() - manually resolve a user page fault
1810 * @tsk: the task_struct to use for page fault accounting, or
1811 * NULL if faults are not to be recorded.
1812 * @mm: mm_struct of target mm
1813 * @address: user address
 1814 * @fault_flags: flags to pass down to handle_mm_fault()
1815 *
1816 * This is meant to be called in the specific scenario where for locking reasons
 1817 * This is meant to be called in the specific scenario where, for locking
 1818 * reasons, we try to access user memory in atomic context (within a
 1819 * pagefault_disable() section); that access returns -EFAULT, and we want to
 1820 * resolve the user fault before trying again.
1821 * Typically this is meant to be used by the futex code.
1822 *
1823 * The main difference with get_user_pages() is that this function will
1824 * unconditionally call handle_mm_fault() which will in turn perform all the
1825 * necessary SW fixup of the dirty and young bits in the PTE, while
 1826 * get_user_pages() only guarantees to update these in the struct page.
1827 *
1828 * This is important for some architectures where those bits also gate the
1829 * access permission to the page because they are maintained in software. On
1830 * such architectures, gup() will not be enough to make a subsequent access
1831 * succeed.
1832 *
 1833 * This should be called with the mmap_sem held for read.
1834 */
1835int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1836 unsigned long address, unsigned int fault_flags)
1837{
1838 struct vm_area_struct *vma;
1839 int ret;
1840
1841 vma = find_extend_vma(mm, address);
1842 if (!vma || address < vma->vm_start)
1843 return -EFAULT;
1844
1845 ret = handle_mm_fault(mm, vma, address, fault_flags);
1846 if (ret & VM_FAULT_ERROR) {
1847 if (ret & VM_FAULT_OOM)
1848 return -ENOMEM;
1849 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1850 return -EHWPOISON;
1851 if (ret & VM_FAULT_SIGBUS)
1852 return -EFAULT;
1853 BUG();
1854 }
1855 if (tsk) {
1856 if (ret & VM_FAULT_MAJOR)
1857 tsk->maj_flt++;
1858 else
1859 tsk->min_flt++;
1860 }
1861 return 0;
1862}
1863
1864/*
1820 * get_user_pages() - pin user pages in memory 1865 * get_user_pages() - pin user pages in memory
1821 * @tsk: the task_struct to use for page fault accounting, or 1866 * @tsk: the task_struct to use for page fault accounting, or
1822 * NULL if faults are not to be recorded. 1867 * NULL if faults are not to be recorded.
@@ -3104,14 +3149,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3104 pte_t *page_table; 3149 pte_t *page_table;
3105 spinlock_t *ptl; 3150 spinlock_t *ptl;
3106 struct page *page; 3151 struct page *page;
3152 struct page *cow_page;
3107 pte_t entry; 3153 pte_t entry;
3108 int anon = 0; 3154 int anon = 0;
3109 int charged = 0;
3110 struct page *dirty_page = NULL; 3155 struct page *dirty_page = NULL;
3111 struct vm_fault vmf; 3156 struct vm_fault vmf;
3112 int ret; 3157 int ret;
3113 int page_mkwrite = 0; 3158 int page_mkwrite = 0;
3114 3159
3160 /*
 3161 * If we do COW later, allocate the page before taking lock_page()
3162 * on the file cache page. This will reduce lock holding time.
3163 */
3164 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3165
3166 if (unlikely(anon_vma_prepare(vma)))
3167 return VM_FAULT_OOM;
3168
3169 cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
3170 if (!cow_page)
3171 return VM_FAULT_OOM;
3172
3173 if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
3174 page_cache_release(cow_page);
3175 return VM_FAULT_OOM;
3176 }
3177 } else
3178 cow_page = NULL;
3179
3115 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 3180 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
3116 vmf.pgoff = pgoff; 3181 vmf.pgoff = pgoff;
3117 vmf.flags = flags; 3182 vmf.flags = flags;
@@ -3120,12 +3185,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3120 ret = vma->vm_ops->fault(vma, &vmf); 3185 ret = vma->vm_ops->fault(vma, &vmf);
3121 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | 3186 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3122 VM_FAULT_RETRY))) 3187 VM_FAULT_RETRY)))
3123 return ret; 3188 goto uncharge_out;
3124 3189
3125 if (unlikely(PageHWPoison(vmf.page))) { 3190 if (unlikely(PageHWPoison(vmf.page))) {
3126 if (ret & VM_FAULT_LOCKED) 3191 if (ret & VM_FAULT_LOCKED)
3127 unlock_page(vmf.page); 3192 unlock_page(vmf.page);
3128 return VM_FAULT_HWPOISON; 3193 ret = VM_FAULT_HWPOISON;
3194 goto uncharge_out;
3129 } 3195 }
3130 3196
3131 /* 3197 /*
@@ -3143,23 +3209,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3143 page = vmf.page; 3209 page = vmf.page;
3144 if (flags & FAULT_FLAG_WRITE) { 3210 if (flags & FAULT_FLAG_WRITE) {
3145 if (!(vma->vm_flags & VM_SHARED)) { 3211 if (!(vma->vm_flags & VM_SHARED)) {
3212 page = cow_page;
3146 anon = 1; 3213 anon = 1;
3147 if (unlikely(anon_vma_prepare(vma))) {
3148 ret = VM_FAULT_OOM;
3149 goto out;
3150 }
3151 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
3152 vma, address);
3153 if (!page) {
3154 ret = VM_FAULT_OOM;
3155 goto out;
3156 }
3157 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
3158 ret = VM_FAULT_OOM;
3159 page_cache_release(page);
3160 goto out;
3161 }
3162 charged = 1;
3163 copy_user_highpage(page, vmf.page, address, vma); 3214 copy_user_highpage(page, vmf.page, address, vma);
3164 __SetPageUptodate(page); 3215 __SetPageUptodate(page);
3165 } else { 3216 } else {
@@ -3228,8 +3279,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3228 /* no need to invalidate: a not-present page won't be cached */ 3279 /* no need to invalidate: a not-present page won't be cached */
3229 update_mmu_cache(vma, address, page_table); 3280 update_mmu_cache(vma, address, page_table);
3230 } else { 3281 } else {
3231 if (charged) 3282 if (cow_page)
3232 mem_cgroup_uncharge_page(page); 3283 mem_cgroup_uncharge_page(cow_page);
3233 if (anon) 3284 if (anon)
3234 page_cache_release(page); 3285 page_cache_release(page);
3235 else 3286 else
@@ -3238,7 +3289,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3238 3289
3239 pte_unmap_unlock(page_table, ptl); 3290 pte_unmap_unlock(page_table, ptl);
3240 3291
3241out:
3242 if (dirty_page) { 3292 if (dirty_page) {
3243 struct address_space *mapping = page->mapping; 3293 struct address_space *mapping = page->mapping;
3244 3294
@@ -3268,6 +3318,13 @@ out:
3268unwritable_page: 3318unwritable_page:
3269 page_cache_release(page); 3319 page_cache_release(page);
3270 return ret; 3320 return ret;
3321uncharge_out:
 3322 /* the filesystem's fault handler returned an error */
3323 if (cow_page) {
3324 mem_cgroup_uncharge_page(cow_page);
3325 page_cache_release(cow_page);
3326 }
3327 return ret;
3271} 3328}
3272 3329
3273static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3330static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
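
The __do_fault() change above moves the COW page allocation and memcg charge in front of lock_page() on the file cache page, so the potentially slow allocation no longer happens while the page lock is held, and the error paths uncharge and release the pre-allocated page. A small userspace sketch of the same "allocate before taking the lock" ordering, with a pthread mutex standing in for the page lock (hypothetical names, not kernel code):

    #include <pthread.h>
    #include <stdlib.h>
    #include <string.h>

    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
    static char cached_page[4096];          /* stands in for the locked file cache page */

    /* Allocate the private copy first, then hold the lock only for the copy itself. */
    static char *cow_fault(void)
    {
        char *cow_page = malloc(sizeof(cached_page));  /* may be slow; done without the lock */

        if (!cow_page)
            return NULL;                               /* VM_FAULT_OOM equivalent */

        pthread_mutex_lock(&page_lock);
        memcpy(cow_page, cached_page, sizeof(cached_page));
        pthread_mutex_unlock(&page_lock);
        return cow_page;
    }
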
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c46887b5a11..6e7d8b21dbf 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,17 @@
34 34
35#include "internal.h" 35#include "internal.h"
36 36
37/*
 38 * online_page_callback contains a pointer to the current page-onlining
 39 * function. Initially it is generic_online_page(). If required, it can be
 40 * changed by calling set_online_page_callback() to register a callback and
 41 * restore_online_page_callback() to restore the generic one.
42 */
43
44static void generic_online_page(struct page *page);
45
46static online_page_callback_t online_page_callback = generic_online_page;
47
37DEFINE_MUTEX(mem_hotplug_mutex); 48DEFINE_MUTEX(mem_hotplug_mutex);
38 49
39void lock_memory_hotplug(void) 50void lock_memory_hotplug(void)
@@ -361,23 +372,74 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
361} 372}
362EXPORT_SYMBOL_GPL(__remove_pages); 373EXPORT_SYMBOL_GPL(__remove_pages);
363 374
364void online_page(struct page *page) 375int set_online_page_callback(online_page_callback_t callback)
376{
377 int rc = -EINVAL;
378
379 lock_memory_hotplug();
380
381 if (online_page_callback == generic_online_page) {
382 online_page_callback = callback;
383 rc = 0;
384 }
385
386 unlock_memory_hotplug();
387
388 return rc;
389}
390EXPORT_SYMBOL_GPL(set_online_page_callback);
391
392int restore_online_page_callback(online_page_callback_t callback)
393{
394 int rc = -EINVAL;
395
396 lock_memory_hotplug();
397
398 if (online_page_callback == callback) {
399 online_page_callback = generic_online_page;
400 rc = 0;
401 }
402
403 unlock_memory_hotplug();
404
405 return rc;
406}
407EXPORT_SYMBOL_GPL(restore_online_page_callback);
408
409void __online_page_set_limits(struct page *page)
365{ 410{
366 unsigned long pfn = page_to_pfn(page); 411 unsigned long pfn = page_to_pfn(page);
367 412
368 totalram_pages++;
369 if (pfn >= num_physpages) 413 if (pfn >= num_physpages)
370 num_physpages = pfn + 1; 414 num_physpages = pfn + 1;
415}
416EXPORT_SYMBOL_GPL(__online_page_set_limits);
417
418void __online_page_increment_counters(struct page *page)
419{
420 totalram_pages++;
371 421
372#ifdef CONFIG_HIGHMEM 422#ifdef CONFIG_HIGHMEM
373 if (PageHighMem(page)) 423 if (PageHighMem(page))
374 totalhigh_pages++; 424 totalhigh_pages++;
375#endif 425#endif
426}
427EXPORT_SYMBOL_GPL(__online_page_increment_counters);
376 428
429void __online_page_free(struct page *page)
430{
377 ClearPageReserved(page); 431 ClearPageReserved(page);
378 init_page_count(page); 432 init_page_count(page);
379 __free_page(page); 433 __free_page(page);
380} 434}
435EXPORT_SYMBOL_GPL(__online_page_free);
436
437static void generic_online_page(struct page *page)
438{
439 __online_page_set_limits(page);
440 __online_page_increment_counters(page);
441 __online_page_free(page);
442}
381 443
382static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 444static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
383 void *arg) 445 void *arg)
@@ -388,7 +450,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
388 if (PageReserved(pfn_to_page(start_pfn))) 450 if (PageReserved(pfn_to_page(start_pfn)))
389 for (i = 0; i < nr_pages; i++) { 451 for (i = 0; i < nr_pages; i++) {
390 page = pfn_to_page(start_pfn + i); 452 page = pfn_to_page(start_pfn + i);
391 online_page(page); 453 (*online_page_callback)(page);
392 onlined_pages++; 454 onlined_pages++;
393 } 455 }
394 *(unsigned long *)arg = onlined_pages; 456 *(unsigned long *)arg = onlined_pages;
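
set_online_page_callback()/restore_online_page_callback() above implement a single-slot callback registration: the slot can only be replaced while it still holds the default, and only the current owner can restore it, all under the hotplug lock. A userspace sketch of that contract with a pthread mutex instead of the hotplug mutex (the names are illustrative, not kernel APIs):

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>

    typedef void (*online_page_cb_t)(unsigned long pfn);

    static void generic_online_page_cb(unsigned long pfn)
    {
        printf("generic onlining of pfn %lu\n", pfn);
    }

    static online_page_cb_t online_page_cb = generic_online_page_cb;
    static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;

    static int set_online_page_cb(online_page_cb_t cb)
    {
        int rc = -EINVAL;

        pthread_mutex_lock(&hotplug_lock);
        if (online_page_cb == generic_online_page_cb) {  /* only one registration at a time */
            online_page_cb = cb;
            rc = 0;
        }
        pthread_mutex_unlock(&hotplug_lock);
        return rc;
    }

    static int restore_online_page_cb(online_page_cb_t cb)
    {
        int rc = -EINVAL;

        pthread_mutex_lock(&hotplug_lock);
        if (online_page_cb == cb) {                      /* only the current owner may restore */
            online_page_cb = generic_online_page_cb;
            rc = 0;
        }
        pthread_mutex_unlock(&hotplug_lock);
        return rc;
    }
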
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e7fb9d25c54..2775fd04924 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,6 +93,7 @@
93 93
94#include <asm/tlbflush.h> 94#include <asm/tlbflush.h>
95#include <asm/uaccess.h> 95#include <asm/uaccess.h>
96#include <linux/random.h>
96 97
97#include "internal.h" 98#include "internal.h"
98 99
@@ -643,14 +644,22 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
643 if (!vma || vma->vm_start > start) 644 if (!vma || vma->vm_start > start)
644 return -EFAULT; 645 return -EFAULT;
645 646
647 if (start > vma->vm_start)
648 prev = vma;
649
646 for (; vma && vma->vm_start < end; prev = vma, vma = next) { 650 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
647 next = vma->vm_next; 651 next = vma->vm_next;
648 vmstart = max(start, vma->vm_start); 652 vmstart = max(start, vma->vm_start);
649 vmend = min(end, vma->vm_end); 653 vmend = min(end, vma->vm_end);
650 654
651 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 655 if (mpol_equal(vma_policy(vma), new_pol))
656 continue;
657
658 pgoff = vma->vm_pgoff +
659 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
652 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, 660 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
653 vma->anon_vma, vma->vm_file, pgoff, new_pol); 661 vma->anon_vma, vma->vm_file, pgoff,
662 new_pol);
654 if (prev) { 663 if (prev) {
655 vma = prev; 664 vma = prev;
656 next = vma->vm_next; 665 next = vma->vm_next;
@@ -1411,7 +1420,9 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1411 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); 1420 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1412 1421
1413 if (!err && nmask) { 1422 if (!err && nmask) {
1414 err = copy_from_user(bm, nm, alloc_size); 1423 unsigned long copy_size;
1424 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1425 err = copy_from_user(bm, nm, copy_size);
1415 /* ensure entire bitmap is zeroed */ 1426 /* ensure entire bitmap is zeroed */
1416 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); 1427 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1417 err |= compat_put_bitmap(nmask, bm, nr_bits); 1428 err |= compat_put_bitmap(nmask, bm, nr_bits);
@@ -1645,6 +1656,21 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
1645 return interleave_nodes(pol); 1656 return interleave_nodes(pol);
1646} 1657}
1647 1658
1659/*
1660 * Return the bit number of a random bit set in the nodemask.
1661 * (returns -1 if nodemask is empty)
1662 */
1663int node_random(const nodemask_t *maskp)
1664{
1665 int w, bit = -1;
1666
1667 w = nodes_weight(*maskp);
1668 if (w)
1669 bit = bitmap_ord_to_pos(maskp->bits,
1670 get_random_int() % w, MAX_NUMNODES);
1671 return bit;
1672}
1673
1648#ifdef CONFIG_HUGETLBFS 1674#ifdef CONFIG_HUGETLBFS
1649/* 1675/*
1650 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) 1676 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
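
node_random() picks a uniformly random node from a nodemask by counting the set bits (the weight), drawing a random index below that weight, and mapping the ordinal back to a bit position. A userspace sketch of the same ordinal-to-position walk over a plain unsigned long mask, with rand() and an explicit loop standing in for get_random_int() and bitmap_ord_to_pos():

    #include <stdio.h>
    #include <stdlib.h>

    /* Return the position of the ord-th set bit in mask, or -1 if there is none. */
    static int bit_ord_to_pos(unsigned long mask, int ord)
    {
        int pos;

        for (pos = 0; mask; pos++, mask >>= 1) {
            if (!(mask & 1))
                continue;
            if (ord-- == 0)
                return pos;
        }
        return -1;
    }

    /* Analogue of node_random(): -1 for an empty mask, else a random set bit. */
    static int mask_random(unsigned long mask)
    {
        int w = __builtin_popcountl(mask);

        return w ? bit_ord_to_pos(mask, rand() % w) : -1;
    }

    int main(void)
    {
        printf("%d\n", mask_random(0x29UL));    /* prints one of 0, 3 or 5 */
        return 0;
    }
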
diff --git a/mm/migrate.c b/mm/migrate.c
index 666e4e67741..14d0a6a632f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -120,10 +120,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
120 120
121 ptep = pte_offset_map(pmd, addr); 121 ptep = pte_offset_map(pmd, addr);
122 122
123 if (!is_swap_pte(*ptep)) { 123 /*
124 pte_unmap(ptep); 124 * Peek to check is_swap_pte() before taking ptlock? No, we
125 goto out; 125 * can race mremap's move_ptes(), which skips anon_vma lock.
126 } 126 */
127 127
128 ptl = pte_lockptr(mm, pmd); 128 ptl = pte_lockptr(mm, pmd);
129 } 129 }
diff --git a/mm/mincore.c b/mm/mincore.c
index a4e6b9d75c7..636a86876ff 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -69,12 +69,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
69 * file will not get a swp_entry_t in its pte, but rather it is like 69 * file will not get a swp_entry_t in its pte, but rather it is like
70 * any other file mapping (ie. marked !present and faulted in with 70 * any other file mapping (ie. marked !present and faulted in with
71 * tmpfs's .fault). So swapped out tmpfs mappings are tested here. 71 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
72 *
73 * However when tmpfs moves the page from pagecache and into swapcache,
74 * it is still in core, but the find_get_page below won't find it.
75 * No big deal, but make a note of it.
76 */ 72 */
77 page = find_get_page(mapping, pgoff); 73 page = find_get_page(mapping, pgoff);
74#ifdef CONFIG_SWAP
75 /* shmem/tmpfs may return swap: account for swapcache page too. */
76 if (radix_tree_exceptional_entry(page)) {
77 swp_entry_t swap = radix_to_swp_entry(page);
78 page = find_get_page(&swapper_space, swap.val);
79 }
80#endif
78 if (page) { 81 if (page) {
79 present = PageUptodate(page); 82 present = PageUptodate(page);
80 page_cache_release(page); 83 page_cache_release(page);
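
The mincore.c hunk above teaches mincore_page() to follow a shmem/tmpfs swap entry into the swap cache, so tmpfs pages that have moved to swapcache are still reported as in core. From userspace the result is simply what mincore(2) returns; a minimal example that maps a file and prints per-page residency (error handling trimmed, file path is just an example):

    #define _DEFAULT_SOURCE         /* for mincore() with strict -std flags */
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        long page = sysconf(_SC_PAGESIZE);
        int fd = open(argc > 1 ? argv[1] : "/etc/hostname", O_RDONLY);
        struct stat st;
        unsigned char *vec;
        void *map;
        size_t pages, i;

        if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
            return 1;

        pages = (st.st_size + page - 1) / page;
        map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
        vec = malloc(pages);
        if (map == MAP_FAILED || !vec)
            return 1;

        if (mincore(map, st.st_size, vec) == 0)
            for (i = 0; i < pages; i++)
                printf("page %zu: %s\n", i, (vec[i] & 1) ? "resident" : "not resident");

        free(vec);
        munmap(map, st.st_size);
        close(fd);
        return 0;
    }
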
diff --git a/mm/mmap.c b/mm/mmap.c
index d49736ff8a8..a65efd4db3e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -122,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
122 return 0; 122 return 0;
123 123
124 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 124 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
125 unsigned long n; 125 free = global_page_state(NR_FREE_PAGES);
126 free += global_page_state(NR_FILE_PAGES);
127
128 /*
129 * shmem pages shouldn't be counted as free in this
 130 * case: they can't be purged, only swapped out, and
131 * that won't affect the overall amount of available
132 * memory in the system.
133 */
134 free -= global_page_state(NR_SHMEM);
126 135
127 free = global_page_state(NR_FILE_PAGES);
128 free += nr_swap_pages; 136 free += nr_swap_pages;
129 137
130 /* 138 /*
@@ -136,34 +144,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
136 free += global_page_state(NR_SLAB_RECLAIMABLE); 144 free += global_page_state(NR_SLAB_RECLAIMABLE);
137 145
138 /* 146 /*
139 * Leave the last 3% for root
140 */
141 if (!cap_sys_admin)
142 free -= free / 32;
143
144 if (free > pages)
145 return 0;
146
147 /*
148 * nr_free_pages() is very expensive on large systems,
149 * only call if we're about to fail.
150 */
151 n = nr_free_pages();
152
153 /*
154 * Leave reserved pages. The pages are not for anonymous pages. 147 * Leave reserved pages. The pages are not for anonymous pages.
155 */ 148 */
156 if (n <= totalreserve_pages) 149 if (free <= totalreserve_pages)
157 goto error; 150 goto error;
158 else 151 else
159 n -= totalreserve_pages; 152 free -= totalreserve_pages;
160 153
161 /* 154 /*
162 * Leave the last 3% for root 155 * Leave the last 3% for root
163 */ 156 */
164 if (!cap_sys_admin) 157 if (!cap_sys_admin)
165 n -= n / 32; 158 free -= free / 32;
166 free += n;
167 159
168 if (free > pages) 160 if (free > pages)
169 return 0; 161 return 0;
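
After this change the OVERCOMMIT_GUESS estimate in __vm_enough_memory() is built once from the vmstat counters instead of falling back to the expensive nr_free_pages(): free pages plus page cache minus shmem, plus swap and reclaimable slab, minus the reserves, with the last 3% (1/32) held back from unprivileged callers. The same arithmetic as a standalone sketch; the input values are made up, in the kernel they come from global_page_state() and friends:

    #include <stdbool.h>
    #include <stdio.h>

    struct vm_counters {
        unsigned long nr_free_pages;
        unsigned long nr_file_pages;
        unsigned long nr_shmem;
        unsigned long nr_swap_pages;
        unsigned long nr_slab_reclaimable;
        unsigned long totalreserve_pages;
    };

    /* Rough analogue of the OVERCOMMIT_GUESS branch of __vm_enough_memory(). */
    static bool guess_enough_memory(const struct vm_counters *c,
                                    unsigned long pages, bool cap_sys_admin)
    {
        unsigned long free = c->nr_free_pages + c->nr_file_pages;

        free -= c->nr_shmem;                   /* shmem can only be swapped, not dropped */
        free += c->nr_swap_pages;
        free += c->nr_slab_reclaimable;

        if (free <= c->totalreserve_pages)     /* reserves are not for anonymous memory */
            return false;
        free -= c->totalreserve_pages;

        if (!cap_sys_admin)                    /* leave the last 3% for root */
            free -= free / 32;

        return free > pages;
    }

    int main(void)
    {
        struct vm_counters c = {
            .nr_free_pages = 20000, .nr_file_pages = 50000, .nr_shmem = 5000,
            .nr_swap_pages = 100000, .nr_slab_reclaimable = 3000,
            .totalreserve_pages = 4000,
        };

        printf("%s\n", guess_enough_memory(&c, 150000, false) ? "allow" : "deny");
        return 0;
    }
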
diff --git a/mm/nommu.c b/mm/nommu.c
index 9edc897a397..4358032566e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,6 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/tracehook.h>
26#include <linux/blkdev.h> 25#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
28#include <linux/mount.h> 27#include <linux/mount.h>
@@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file,
1087 * it's being traced - otherwise breakpoints set in it may interfere 1086 * it's being traced - otherwise breakpoints set in it may interfere
1088 * with another untraced process 1087 * with another untraced process
1089 */ 1088 */
1090 if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) 1089 if ((flags & MAP_PRIVATE) && current->ptrace)
1091 vm_flags &= ~VM_MAYSHARE; 1090 vm_flags &= ~VM_MAYSHARE;
1092 1091
1093 return vm_flags; 1092 return vm_flags;
@@ -1885,9 +1884,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1885 return 0; 1884 return 0;
1886 1885
1887 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 1886 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
1888 unsigned long n; 1887 free = global_page_state(NR_FREE_PAGES);
1888 free += global_page_state(NR_FILE_PAGES);
1889
1890 /*
1891 * shmem pages shouldn't be counted as free in this
 1892 * case: they can't be purged, only swapped out, and
1893 * that won't affect the overall amount of available
1894 * memory in the system.
1895 */
1896 free -= global_page_state(NR_SHMEM);
1889 1897
1890 free = global_page_state(NR_FILE_PAGES);
1891 free += nr_swap_pages; 1898 free += nr_swap_pages;
1892 1899
1893 /* 1900 /*
@@ -1899,34 +1906,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1899 free += global_page_state(NR_SLAB_RECLAIMABLE); 1906 free += global_page_state(NR_SLAB_RECLAIMABLE);
1900 1907
1901 /* 1908 /*
1902 * Leave the last 3% for root
1903 */
1904 if (!cap_sys_admin)
1905 free -= free / 32;
1906
1907 if (free > pages)
1908 return 0;
1909
1910 /*
1911 * nr_free_pages() is very expensive on large systems,
1912 * only call if we're about to fail.
1913 */
1914 n = nr_free_pages();
1915
1916 /*
1917 * Leave reserved pages. The pages are not for anonymous pages. 1909 * Leave reserved pages. The pages are not for anonymous pages.
1918 */ 1910 */
1919 if (n <= totalreserve_pages) 1911 if (free <= totalreserve_pages)
1920 goto error; 1912 goto error;
1921 else 1913 else
1922 n -= totalreserve_pages; 1914 free -= totalreserve_pages;
1923 1915
1924 /* 1916 /*
1925 * Leave the last 3% for root 1917 * Leave the last 3% for root
1926 */ 1918 */
1927 if (!cap_sys_admin) 1919 if (!cap_sys_admin)
1928 n -= n / 32; 1920 free -= free / 32;
1929 free += n;
1930 1921
1931 if (free > pages) 1922 if (free > pages)
1932 return 0; 1923 return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e4b0991ca35..e9a17857a20 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -162,7 +162,7 @@ static bool oom_unkillable_task(struct task_struct *p,
162unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, 162unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
163 const nodemask_t *nodemask, unsigned long totalpages) 163 const nodemask_t *nodemask, unsigned long totalpages)
164{ 164{
165 int points; 165 long points;
166 166
167 if (oom_unkillable_task(p, mem, nodemask)) 167 if (oom_unkillable_task(p, mem, nodemask))
168 return 0; 168 return 0;
@@ -303,7 +303,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
303 do_each_thread(g, p) { 303 do_each_thread(g, p) {
304 unsigned int points; 304 unsigned int points;
305 305
306 if (!p->mm) 306 if (p->exit_state)
307 continue; 307 continue;
308 if (oom_unkillable_task(p, mem, nodemask)) 308 if (oom_unkillable_task(p, mem, nodemask))
309 continue; 309 continue;
@@ -319,6 +319,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
319 */ 319 */
320 if (test_tsk_thread_flag(p, TIF_MEMDIE)) 320 if (test_tsk_thread_flag(p, TIF_MEMDIE))
321 return ERR_PTR(-1UL); 321 return ERR_PTR(-1UL);
322 if (!p->mm)
323 continue;
322 324
323 if (p->flags & PF_EXITING) { 325 if (p->flags & PF_EXITING) {
324 /* 326 /*
@@ -339,8 +341,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
339 * then wait for it to finish before killing 341 * then wait for it to finish before killing
340 * some other task unnecessarily. 342 * some other task unnecessarily.
341 */ 343 */
342 if (!(task_ptrace(p->group_leader) & 344 if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
343 PT_TRACE_EXIT))
344 return ERR_PTR(-1UL); 345 return ERR_PTR(-1UL);
345 } 346 }
346 } 347 }
@@ -488,7 +489,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
488 489
489 /* 490 /*
490 * If any of p's children has a different mm and is eligible for kill, 491 * If any of p's children has a different mm and is eligible for kill,
491 * the one with the highest badness() score is sacrificed for its 492 * the one with the highest oom_badness() score is sacrificed for its
492 * parent. This attempts to lose the minimal amount of work done while 493 * parent. This attempts to lose the minimal amount of work done while
493 * still freeing memory. 494 * still freeing memory.
494 */ 495 */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 31f69886242..0e309cd1b5b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -37,6 +37,16 @@
37#include <trace/events/writeback.h> 37#include <trace/events/writeback.h>
38 38
39/* 39/*
40 * Sleep at most 200ms at a time in balance_dirty_pages().
41 */
42#define MAX_PAUSE max(HZ/5, 1)
43
44/*
45 * Estimate write bandwidth at 200ms intervals.
46 */
47#define BANDWIDTH_INTERVAL max(HZ/5, 1)
48
49/*
40 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 50 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
41 * will look to see if it needs to force writeback or throttling. 51 * will look to see if it needs to force writeback or throttling.
42 */ 52 */
@@ -111,6 +121,7 @@ EXPORT_SYMBOL(laptop_mode);
111 121
112/* End of sysctl-exported parameters */ 122/* End of sysctl-exported parameters */
113 123
124unsigned long global_dirty_limit;
114 125
115/* 126/*
116 * Scale the writeback cache size proportional to the relative writeout speeds. 127 * Scale the writeback cache size proportional to the relative writeout speeds.
@@ -219,6 +230,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
219 */ 230 */
220static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) 231static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
221{ 232{
233 __inc_bdi_stat(bdi, BDI_WRITTEN);
222 __prop_inc_percpu_max(&vm_completions, &bdi->completions, 234 __prop_inc_percpu_max(&vm_completions, &bdi->completions,
223 bdi->max_prop_frac); 235 bdi->max_prop_frac);
224} 236}
@@ -244,13 +256,8 @@ void task_dirty_inc(struct task_struct *tsk)
244static void bdi_writeout_fraction(struct backing_dev_info *bdi, 256static void bdi_writeout_fraction(struct backing_dev_info *bdi,
245 long *numerator, long *denominator) 257 long *numerator, long *denominator)
246{ 258{
247 if (bdi_cap_writeback_dirty(bdi)) { 259 prop_fraction_percpu(&vm_completions, &bdi->completions,
248 prop_fraction_percpu(&vm_completions, &bdi->completions,
249 numerator, denominator); 260 numerator, denominator);
250 } else {
251 *numerator = 0;
252 *denominator = 1;
253 }
254} 261}
255 262
256static inline void task_dirties_fraction(struct task_struct *tsk, 263static inline void task_dirties_fraction(struct task_struct *tsk,
@@ -274,12 +281,13 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
274 * effectively curb the growth of dirty pages. Light dirtiers with high enough 281 * effectively curb the growth of dirty pages. Light dirtiers with high enough
275 * dirty threshold may never get throttled. 282 * dirty threshold may never get throttled.
276 */ 283 */
284#define TASK_LIMIT_FRACTION 8
277static unsigned long task_dirty_limit(struct task_struct *tsk, 285static unsigned long task_dirty_limit(struct task_struct *tsk,
278 unsigned long bdi_dirty) 286 unsigned long bdi_dirty)
279{ 287{
280 long numerator, denominator; 288 long numerator, denominator;
281 unsigned long dirty = bdi_dirty; 289 unsigned long dirty = bdi_dirty;
282 u64 inv = dirty >> 3; 290 u64 inv = dirty / TASK_LIMIT_FRACTION;
283 291
284 task_dirties_fraction(tsk, &numerator, &denominator); 292 task_dirties_fraction(tsk, &numerator, &denominator);
285 inv *= numerator; 293 inv *= numerator;
@@ -290,6 +298,12 @@ static unsigned long task_dirty_limit(struct task_struct *tsk,
290 return max(dirty, bdi_dirty/2); 298 return max(dirty, bdi_dirty/2);
291} 299}
292 300
301/* Minimum limit for any task */
302static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
303{
304 return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
305}
306
293/* 307/*
294 * 308 *
295 */ 309 */
@@ -397,6 +411,11 @@ unsigned long determine_dirtyable_memory(void)
397 return x + 1; /* Ensure that we never return 0 */ 411 return x + 1; /* Ensure that we never return 0 */
398} 412}
399 413
414static unsigned long hard_dirty_limit(unsigned long thresh)
415{
416 return max(thresh, global_dirty_limit);
417}
418
400/* 419/*
401 * global_dirty_limits - background-writeback and dirty-throttling thresholds 420 * global_dirty_limits - background-writeback and dirty-throttling thresholds
402 * 421 *
@@ -435,12 +454,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
435 } 454 }
436 *pbackground = background; 455 *pbackground = background;
437 *pdirty = dirty; 456 *pdirty = dirty;
457 trace_global_dirty_state(background, dirty);
438} 458}
439 459
440/* 460/**
441 * bdi_dirty_limit - @bdi's share of dirty throttling threshold 461 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
462 * @bdi: the backing_dev_info to query
463 * @dirty: global dirty limit in pages
442 * 464 *
443 * Allocate high/low dirty limits to fast/slow devices, in order to prevent 465 * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
466 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
467 * And the "limit" in the name is not seriously taken as hard limit in
468 * balance_dirty_pages().
469 *
470 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
444 * - starving fast devices 471 * - starving fast devices
445 * - piling up dirty pages (that will take long time to sync) on slow devices 472 * - piling up dirty pages (that will take long time to sync) on slow devices
446 * 473 *
@@ -468,6 +495,153 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
468 return bdi_dirty; 495 return bdi_dirty;
469} 496}
470 497
498static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
499 unsigned long elapsed,
500 unsigned long written)
501{
502 const unsigned long period = roundup_pow_of_two(3 * HZ);
503 unsigned long avg = bdi->avg_write_bandwidth;
504 unsigned long old = bdi->write_bandwidth;
505 u64 bw;
506
507 /*
508 * bw = written * HZ / elapsed
509 *
510 * bw * elapsed + write_bandwidth * (period - elapsed)
511 * write_bandwidth = ---------------------------------------------------
512 * period
513 */
514 bw = written - bdi->written_stamp;
515 bw *= HZ;
516 if (unlikely(elapsed > period)) {
517 do_div(bw, elapsed);
518 avg = bw;
519 goto out;
520 }
521 bw += (u64)bdi->write_bandwidth * (period - elapsed);
522 bw >>= ilog2(period);
523
524 /*
525 * one more level of smoothing, for filtering out sudden spikes
526 */
527 if (avg > old && old >= (unsigned long)bw)
528 avg -= (avg - old) >> 3;
529
530 if (avg < old && old <= (unsigned long)bw)
531 avg += (old - avg) >> 3;
532
533out:
534 bdi->write_bandwidth = bw;
535 bdi->avg_write_bandwidth = avg;
536}
537
538/*
539 * The global dirtyable memory and dirty threshold could be suddenly knocked
540 * down by a large amount (eg. on the startup of KVM in a swapless system).
541 * This may throw the system into deep dirty exceeded state and throttle
542 * heavy/light dirtiers alike. To retain good responsiveness, maintain
543 * global_dirty_limit for tracking slowly down to the knocked down dirty
544 * threshold.
545 */
546static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
547{
548 unsigned long limit = global_dirty_limit;
549
550 /*
551 * Follow up in one step.
552 */
553 if (limit < thresh) {
554 limit = thresh;
555 goto update;
556 }
557
558 /*
559 * Follow down slowly. Use the higher one as the target, because thresh
560 * may drop below dirty. This is exactly the reason to introduce
561 * global_dirty_limit which is guaranteed to lie above the dirty pages.
562 */
563 thresh = max(thresh, dirty);
564 if (limit > thresh) {
565 limit -= (limit - thresh) >> 5;
566 goto update;
567 }
568 return;
569update:
570 global_dirty_limit = limit;
571}
572
573static void global_update_bandwidth(unsigned long thresh,
574 unsigned long dirty,
575 unsigned long now)
576{
577 static DEFINE_SPINLOCK(dirty_lock);
578 static unsigned long update_time;
579
580 /*
 581 * check locklessly first to optimize away locking most of the time
582 */
583 if (time_before(now, update_time + BANDWIDTH_INTERVAL))
584 return;
585
586 spin_lock(&dirty_lock);
587 if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
588 update_dirty_limit(thresh, dirty);
589 update_time = now;
590 }
591 spin_unlock(&dirty_lock);
592}
593
594void __bdi_update_bandwidth(struct backing_dev_info *bdi,
595 unsigned long thresh,
596 unsigned long dirty,
597 unsigned long bdi_thresh,
598 unsigned long bdi_dirty,
599 unsigned long start_time)
600{
601 unsigned long now = jiffies;
602 unsigned long elapsed = now - bdi->bw_time_stamp;
603 unsigned long written;
604
605 /*
606 * rate-limit, only update once every 200ms.
607 */
608 if (elapsed < BANDWIDTH_INTERVAL)
609 return;
610
611 written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
612
613 /*
614 * Skip quiet periods when disk bandwidth is under-utilized.
615 * (at least 1s idle time between two flusher runs)
616 */
617 if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
618 goto snapshot;
619
620 if (thresh)
621 global_update_bandwidth(thresh, dirty, now);
622
623 bdi_update_write_bandwidth(bdi, elapsed, written);
624
625snapshot:
626 bdi->written_stamp = written;
627 bdi->bw_time_stamp = now;
628}
629
630static void bdi_update_bandwidth(struct backing_dev_info *bdi,
631 unsigned long thresh,
632 unsigned long dirty,
633 unsigned long bdi_thresh,
634 unsigned long bdi_dirty,
635 unsigned long start_time)
636{
637 if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
638 return;
639 spin_lock(&bdi->wb.list_lock);
640 __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
641 start_time);
642 spin_unlock(&bdi->wb.list_lock);
643}
644
471/* 645/*
472 * balance_dirty_pages() must be called by processes which are generating dirty 646 * balance_dirty_pages() must be called by processes which are generating dirty
473 * data. It looks at the number of dirty pages in the machine and will force 647 * data. It looks at the number of dirty pages in the machine and will force
@@ -478,27 +652,25 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
478static void balance_dirty_pages(struct address_space *mapping, 652static void balance_dirty_pages(struct address_space *mapping,
479 unsigned long write_chunk) 653 unsigned long write_chunk)
480{ 654{
481 long nr_reclaimable, bdi_nr_reclaimable; 655 unsigned long nr_reclaimable, bdi_nr_reclaimable;
482 long nr_writeback, bdi_nr_writeback; 656 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
657 unsigned long bdi_dirty;
483 unsigned long background_thresh; 658 unsigned long background_thresh;
484 unsigned long dirty_thresh; 659 unsigned long dirty_thresh;
485 unsigned long bdi_thresh; 660 unsigned long bdi_thresh;
661 unsigned long task_bdi_thresh;
662 unsigned long min_task_bdi_thresh;
486 unsigned long pages_written = 0; 663 unsigned long pages_written = 0;
487 unsigned long pause = 1; 664 unsigned long pause = 1;
488 bool dirty_exceeded = false; 665 bool dirty_exceeded = false;
666 bool clear_dirty_exceeded = true;
489 struct backing_dev_info *bdi = mapping->backing_dev_info; 667 struct backing_dev_info *bdi = mapping->backing_dev_info;
668 unsigned long start_time = jiffies;
490 669
491 for (;;) { 670 for (;;) {
492 struct writeback_control wbc = {
493 .sync_mode = WB_SYNC_NONE,
494 .older_than_this = NULL,
495 .nr_to_write = write_chunk,
496 .range_cyclic = 1,
497 };
498
499 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 671 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
500 global_page_state(NR_UNSTABLE_NFS); 672 global_page_state(NR_UNSTABLE_NFS);
501 nr_writeback = global_page_state(NR_WRITEBACK); 673 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
502 674
503 global_dirty_limits(&background_thresh, &dirty_thresh); 675 global_dirty_limits(&background_thresh, &dirty_thresh);
504 676
@@ -507,12 +679,12 @@ static void balance_dirty_pages(struct address_space *mapping,
507 * catch-up. This avoids (excessively) small writeouts 679 * catch-up. This avoids (excessively) small writeouts
508 * when the bdi limits are ramping up. 680 * when the bdi limits are ramping up.
509 */ 681 */
510 if (nr_reclaimable + nr_writeback <= 682 if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
511 (background_thresh + dirty_thresh) / 2)
512 break; 683 break;
513 684
514 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 685 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
515 bdi_thresh = task_dirty_limit(current, bdi_thresh); 686 min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
687 task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
516 688
517 /* 689 /*
518 * In order to avoid the stacked BDI deadlock we need 690 * In order to avoid the stacked BDI deadlock we need
@@ -524,12 +696,14 @@ static void balance_dirty_pages(struct address_space *mapping,
524 * actually dirty; with m+n sitting in the percpu 696 * actually dirty; with m+n sitting in the percpu
525 * deltas. 697 * deltas.
526 */ 698 */
527 if (bdi_thresh < 2*bdi_stat_error(bdi)) { 699 if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
528 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 700 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
529 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); 701 bdi_dirty = bdi_nr_reclaimable +
702 bdi_stat_sum(bdi, BDI_WRITEBACK);
530 } else { 703 } else {
531 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 704 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
532 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); 705 bdi_dirty = bdi_nr_reclaimable +
706 bdi_stat(bdi, BDI_WRITEBACK);
533 } 707 }
534 708
535 /* 709 /*
@@ -538,9 +712,10 @@ static void balance_dirty_pages(struct address_space *mapping,
538 * bdi or process from holding back light ones; The latter is 712 * bdi or process from holding back light ones; The latter is
539 * the last resort safeguard. 713 * the last resort safeguard.
540 */ 714 */
541 dirty_exceeded = 715 dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
542 (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) 716 (nr_dirty > dirty_thresh);
543 || (nr_reclaimable + nr_writeback > dirty_thresh); 717 clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
718 (nr_dirty <= dirty_thresh);
544 719
545 if (!dirty_exceeded) 720 if (!dirty_exceeded)
546 break; 721 break;
@@ -548,6 +723,9 @@ static void balance_dirty_pages(struct address_space *mapping,
548 if (!bdi->dirty_exceeded) 723 if (!bdi->dirty_exceeded)
549 bdi->dirty_exceeded = 1; 724 bdi->dirty_exceeded = 1;
550 725
726 bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
727 bdi_thresh, bdi_dirty, start_time);
728
551 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 729 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
552 * Unstable writes are a feature of certain networked 730 * Unstable writes are a feature of certain networked
553 * filesystems (i.e. NFS) in which data may have been 731 * filesystems (i.e. NFS) in which data may have been
@@ -557,17 +735,29 @@ static void balance_dirty_pages(struct address_space *mapping,
557 * threshold otherwise wait until the disk writes catch 735 * threshold otherwise wait until the disk writes catch
558 * up. 736 * up.
559 */ 737 */
560 trace_wbc_balance_dirty_start(&wbc, bdi); 738 trace_balance_dirty_start(bdi);
561 if (bdi_nr_reclaimable > bdi_thresh) { 739 if (bdi_nr_reclaimable > task_bdi_thresh) {
562 writeback_inodes_wb(&bdi->wb, &wbc); 740 pages_written += writeback_inodes_wb(&bdi->wb,
563 pages_written += write_chunk - wbc.nr_to_write; 741 write_chunk);
564 trace_wbc_balance_dirty_written(&wbc, bdi); 742 trace_balance_dirty_written(bdi, pages_written);
565 if (pages_written >= write_chunk) 743 if (pages_written >= write_chunk)
566 break; /* We've done our duty */ 744 break; /* We've done our duty */
567 } 745 }
568 trace_wbc_balance_dirty_wait(&wbc, bdi);
569 __set_current_state(TASK_UNINTERRUPTIBLE); 746 __set_current_state(TASK_UNINTERRUPTIBLE);
570 io_schedule_timeout(pause); 747 io_schedule_timeout(pause);
748 trace_balance_dirty_wait(bdi);
749
750 dirty_thresh = hard_dirty_limit(dirty_thresh);
751 /*
752 * max-pause area. If dirty exceeded but still within this
753 * area, no need to sleep for more than 200ms: (a) 8 pages per
754 * 200ms is typically more than enough to curb heavy dirtiers;
755 * (b) the pause time limit makes the dirtiers more responsive.
756 */
757 if (nr_dirty < dirty_thresh &&
758 bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 &&
759 time_after(jiffies, start_time + MAX_PAUSE))
760 break;
571 761
572 /* 762 /*
573 * Increase the delay for each loop, up to our previous 763 * Increase the delay for each loop, up to our previous
@@ -578,7 +768,8 @@ static void balance_dirty_pages(struct address_space *mapping,
578 pause = HZ / 10; 768 pause = HZ / 10;
579 } 769 }
580 770
581 if (!dirty_exceeded && bdi->dirty_exceeded) 771 /* Clear dirty_exceeded flag only when no task can exceed the limit */
772 if (clear_dirty_exceeded && bdi->dirty_exceeded)
582 bdi->dirty_exceeded = 0; 773 bdi->dirty_exceeded = 0;
583 774
584 if (writeback_in_progress(bdi)) 775 if (writeback_in_progress(bdi))
@@ -626,9 +817,13 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
626void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 817void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
627 unsigned long nr_pages_dirtied) 818 unsigned long nr_pages_dirtied)
628{ 819{
820 struct backing_dev_info *bdi = mapping->backing_dev_info;
629 unsigned long ratelimit; 821 unsigned long ratelimit;
630 unsigned long *p; 822 unsigned long *p;
631 823
824 if (!bdi_cap_account_dirty(bdi))
825 return;
826
632 ratelimit = ratelimit_pages; 827 ratelimit = ratelimit_pages;
633 if (mapping->backing_dev_info->dirty_exceeded) 828 if (mapping->backing_dev_info->dirty_exceeded)
634 ratelimit = 8; 829 ratelimit = 8;
@@ -892,12 +1087,12 @@ int write_cache_pages(struct address_space *mapping,
892 range_whole = 1; 1087 range_whole = 1;
893 cycled = 1; /* ignore range_cyclic tests */ 1088 cycled = 1; /* ignore range_cyclic tests */
894 } 1089 }
895 if (wbc->sync_mode == WB_SYNC_ALL) 1090 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
896 tag = PAGECACHE_TAG_TOWRITE; 1091 tag = PAGECACHE_TAG_TOWRITE;
897 else 1092 else
898 tag = PAGECACHE_TAG_DIRTY; 1093 tag = PAGECACHE_TAG_DIRTY;
899retry: 1094retry:
900 if (wbc->sync_mode == WB_SYNC_ALL) 1095 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
901 tag_pages_for_writeback(mapping, index, end); 1096 tag_pages_for_writeback(mapping, index, end);
902 done_index = index; 1097 done_index = index;
903 while (!done && (index <= end)) { 1098 while (!done && (index <= end)) {
@@ -1141,7 +1336,6 @@ EXPORT_SYMBOL(account_page_dirtied);
1141void account_page_writeback(struct page *page) 1336void account_page_writeback(struct page *page)
1142{ 1337{
1143 inc_zone_page_state(page, NR_WRITEBACK); 1338 inc_zone_page_state(page, NR_WRITEBACK);
1144 inc_zone_page_state(page, NR_WRITTEN);
1145} 1339}
1146EXPORT_SYMBOL(account_page_writeback); 1340EXPORT_SYMBOL(account_page_writeback);
1147 1341
@@ -1358,8 +1552,10 @@ int test_clear_page_writeback(struct page *page)
1358 } else { 1552 } else {
1359 ret = TestClearPageWriteback(page); 1553 ret = TestClearPageWriteback(page);
1360 } 1554 }
1361 if (ret) 1555 if (ret) {
1362 dec_zone_page_state(page, NR_WRITEBACK); 1556 dec_zone_page_state(page, NR_WRITEBACK);
1557 inc_zone_page_state(page, NR_WRITTEN);
1558 }
1363 return ret; 1559 return ret;
1364} 1560}
1365 1561
@@ -1405,10 +1601,6 @@ EXPORT_SYMBOL(test_set_page_writeback);
1405 */ 1601 */
1406int mapping_tagged(struct address_space *mapping, int tag) 1602int mapping_tagged(struct address_space *mapping, int tag)
1407{ 1603{
1408 int ret; 1604 return radix_tree_tagged(&mapping->page_tree, tag);
1409 rcu_read_lock();
1410 ret = radix_tree_tagged(&mapping->page_tree, tag);
1411 rcu_read_unlock();
1412 return ret;
1413} 1605}
1414EXPORT_SYMBOL(mapping_tagged); 1606EXPORT_SYMBOL(mapping_tagged);
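
bdi_update_write_bandwidth() above estimates per-device write bandwidth every BANDWIDTH_INTERVAL by blending the rate observed in the last interval into the previous estimate over a roughly 3-second period, with a second smoothing stage on avg_write_bandwidth that only follows confirmed trends and so filters out sudden spikes. A standalone sketch of the two-stage estimator in plain integer math; HZ, the period and the sample values here are assumptions for illustration, not the kernel's configuration:

    #include <stdio.h>

    #define HZ 100UL
    #define BW_PERIOD 512UL                 /* roundup_pow_of_two(3 * HZ) for HZ=100 */

    struct bw_state {
        unsigned long write_bandwidth;      /* pages per second, last-interval weighted */
        unsigned long avg_write_bandwidth;  /* extra smoothing against sudden spikes */
        unsigned long written_stamp;        /* pages written at the previous update */
    };

    static void update_write_bandwidth(struct bw_state *s, unsigned long elapsed,
                                       unsigned long written)
    {
        unsigned long avg = s->avg_write_bandwidth;
        unsigned long old = s->write_bandwidth;
        unsigned long long bw = (unsigned long long)(written - s->written_stamp) * HZ;

        if (elapsed > BW_PERIOD) {
            bw /= elapsed;                  /* interval too long: take the sample as-is */
            avg = bw;
        } else {
            /* bw = (sample * elapsed + old * (period - elapsed)) / period */
            bw += (unsigned long long)old * (BW_PERIOD - elapsed);
            bw >>= __builtin_ctzl(BW_PERIOD);

            /* move avg an eighth of the way only when the new sample confirms the trend */
            if (avg > old && old >= (unsigned long)bw)
                avg -= (avg - old) >> 3;
            if (avg < old && old <= (unsigned long)bw)
                avg += (old - avg) >> 3;
        }

        s->write_bandwidth = bw;
        s->avg_write_bandwidth = avg;
        s->written_stamp = written;
    }

    int main(void)
    {
        struct bw_state s = { .write_bandwidth = 25600, .avg_write_bandwidth = 25600 };
        unsigned long t, written = 0;

        for (t = 1; t <= 5; t++) {
            written += 4000;                /* 4000 pages per 20-jiffy interval */
            update_write_bandwidth(&s, 20, written);
            printf("bw=%lu avg=%lu\n", s.write_bandwidth, s.avg_write_bandwidth);
        }
        return 0;
    }
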
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e8985acdab..8859578e4bd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -127,6 +127,20 @@ void pm_restrict_gfp_mask(void)
127 saved_gfp_mask = gfp_allowed_mask; 127 saved_gfp_mask = gfp_allowed_mask;
128 gfp_allowed_mask &= ~GFP_IOFS; 128 gfp_allowed_mask &= ~GFP_IOFS;
129} 129}
130
131static bool pm_suspending(void)
132{
133 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
134 return false;
135 return true;
136}
137
138#else
139
140static bool pm_suspending(void)
141{
142 return false;
143}
130#endif /* CONFIG_PM_SLEEP */ 144#endif /* CONFIG_PM_SLEEP */
131 145
132#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 146#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -176,6 +190,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
176}; 190};
177 191
178int min_free_kbytes = 1024; 192int min_free_kbytes = 1024;
193int min_free_order_shift = 1;
179 194
180static unsigned long __meminitdata nr_kernel_pages; 195static unsigned long __meminitdata nr_kernel_pages;
181static unsigned long __meminitdata nr_all_pages; 196static unsigned long __meminitdata nr_all_pages;
@@ -355,8 +370,8 @@ void prep_compound_page(struct page *page, unsigned long order)
355 __SetPageHead(page); 370 __SetPageHead(page);
356 for (i = 1; i < nr_pages; i++) { 371 for (i = 1; i < nr_pages; i++) {
357 struct page *p = page + i; 372 struct page *p = page + i;
358
359 __SetPageTail(p); 373 __SetPageTail(p);
374 set_page_count(p, 0);
360 p->first_page = page; 375 p->first_page = page;
361 } 376 }
362} 377}
@@ -1370,21 +1385,12 @@ failed:
1370 1385
1371#ifdef CONFIG_FAIL_PAGE_ALLOC 1386#ifdef CONFIG_FAIL_PAGE_ALLOC
1372 1387
1373static struct fail_page_alloc_attr { 1388static struct {
1374 struct fault_attr attr; 1389 struct fault_attr attr;
1375 1390
1376 u32 ignore_gfp_highmem; 1391 u32 ignore_gfp_highmem;
1377 u32 ignore_gfp_wait; 1392 u32 ignore_gfp_wait;
1378 u32 min_order; 1393 u32 min_order;
1379
1380#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1381
1382 struct dentry *ignore_gfp_highmem_file;
1383 struct dentry *ignore_gfp_wait_file;
1384 struct dentry *min_order_file;
1385
1386#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1387
1388} fail_page_alloc = { 1394} fail_page_alloc = {
1389 .attr = FAULT_ATTR_INITIALIZER, 1395 .attr = FAULT_ATTR_INITIALIZER,
1390 .ignore_gfp_wait = 1, 1396 .ignore_gfp_wait = 1,
@@ -1418,36 +1424,27 @@ static int __init fail_page_alloc_debugfs(void)
1418{ 1424{
1419 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1425 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1420 struct dentry *dir; 1426 struct dentry *dir;
1421 int err;
1422
1423 err = init_fault_attr_dentries(&fail_page_alloc.attr,
1424 "fail_page_alloc");
1425 if (err)
1426 return err;
1427 dir = fail_page_alloc.attr.dentries.dir;
1428
1429 fail_page_alloc.ignore_gfp_wait_file =
1430 debugfs_create_bool("ignore-gfp-wait", mode, dir,
1431 &fail_page_alloc.ignore_gfp_wait);
1432
1433 fail_page_alloc.ignore_gfp_highmem_file =
1434 debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1435 &fail_page_alloc.ignore_gfp_highmem);
1436 fail_page_alloc.min_order_file =
1437 debugfs_create_u32("min-order", mode, dir,
1438 &fail_page_alloc.min_order);
1439
1440 if (!fail_page_alloc.ignore_gfp_wait_file ||
1441 !fail_page_alloc.ignore_gfp_highmem_file ||
1442 !fail_page_alloc.min_order_file) {
1443 err = -ENOMEM;
1444 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
1445 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
1446 debugfs_remove(fail_page_alloc.min_order_file);
1447 cleanup_fault_attr_dentries(&fail_page_alloc.attr);
1448 }
1449 1427
1450 return err; 1428 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1429 &fail_page_alloc.attr);
1430 if (IS_ERR(dir))
1431 return PTR_ERR(dir);
1432
1433 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1434 &fail_page_alloc.ignore_gfp_wait))
1435 goto fail;
1436 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1437 &fail_page_alloc.ignore_gfp_highmem))
1438 goto fail;
1439 if (!debugfs_create_u32("min-order", mode, dir,
1440 &fail_page_alloc.min_order))
1441 goto fail;
1442
1443 return 0;
1444fail:
1445 debugfs_remove_recursive(dir);
1446
1447 return -ENOMEM;
1451} 1448}
1452 1449
1453late_initcall(fail_page_alloc_debugfs); 1450late_initcall(fail_page_alloc_debugfs);
@@ -1487,7 +1484,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1487 free_pages -= z->free_area[o].nr_free << o; 1484 free_pages -= z->free_area[o].nr_free << o;
1488 1485
1489 /* Require fewer higher order pages to be free */ 1486 /* Require fewer higher order pages to be free */
1490 min >>= 1; 1487 min >>= min_free_order_shift;
1491 1488
1492 if (free_pages <= min) 1489 if (free_pages <= min)
1493 return false; 1490 return false;
@@ -1616,6 +1613,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1616 set_bit(i, zlc->fullzones); 1613 set_bit(i, zlc->fullzones);
1617} 1614}
1618 1615
1616/*
1617 * clear all zones full, called after direct reclaim makes progress so that
1618 * a zone that was recently full is not skipped over for up to a second
1619 */
1620static void zlc_clear_zones_full(struct zonelist *zonelist)
1621{
1622 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1623
1624 zlc = zonelist->zlcache_ptr;
1625 if (!zlc)
1626 return;
1627
1628 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1629}
1630
1619#else /* CONFIG_NUMA */ 1631#else /* CONFIG_NUMA */
1620 1632
1621static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1633static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1632,6 +1644,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1632static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1644static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1633{ 1645{
1634} 1646}
1647
1648static void zlc_clear_zones_full(struct zonelist *zonelist)
1649{
1650}
1635#endif /* CONFIG_NUMA */ 1651#endif /* CONFIG_NUMA */
1636 1652
1637/* 1653/*
@@ -1664,7 +1680,7 @@ zonelist_scan:
1664 continue; 1680 continue;
1665 if ((alloc_flags & ALLOC_CPUSET) && 1681 if ((alloc_flags & ALLOC_CPUSET) &&
1666 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1682 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1667 goto try_next_zone; 1683 continue;
1668 1684
1669 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1685 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1670 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1686 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1676,17 +1692,36 @@ zonelist_scan:
1676 classzone_idx, alloc_flags)) 1692 classzone_idx, alloc_flags))
1677 goto try_this_zone; 1693 goto try_this_zone;
1678 1694
1695 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1696 /*
1697 * we do zlc_setup if there are multiple nodes
1698 * and before considering the first zone allowed
1699 * by the cpuset.
1700 */
1701 allowednodes = zlc_setup(zonelist, alloc_flags);
1702 zlc_active = 1;
1703 did_zlc_setup = 1;
1704 }
1705
1679 if (zone_reclaim_mode == 0) 1706 if (zone_reclaim_mode == 0)
1680 goto this_zone_full; 1707 goto this_zone_full;
1681 1708
1709 /*
1710 * As we may have just activated ZLC, check if the first
1711 * eligible zone has failed zone_reclaim recently.
1712 */
1713 if (NUMA_BUILD && zlc_active &&
1714 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1715 continue;
1716
1682 ret = zone_reclaim(zone, gfp_mask, order); 1717 ret = zone_reclaim(zone, gfp_mask, order);
1683 switch (ret) { 1718 switch (ret) {
1684 case ZONE_RECLAIM_NOSCAN: 1719 case ZONE_RECLAIM_NOSCAN:
1685 /* did not scan */ 1720 /* did not scan */
1686 goto try_next_zone; 1721 continue;
1687 case ZONE_RECLAIM_FULL: 1722 case ZONE_RECLAIM_FULL:
1688 /* scanned but unreclaimable */ 1723 /* scanned but unreclaimable */
1689 goto this_zone_full; 1724 continue;
1690 default: 1725 default:
1691 /* did we reclaim enough */ 1726 /* did we reclaim enough */
1692 if (!zone_watermark_ok(zone, order, mark, 1727 if (!zone_watermark_ok(zone, order, mark,
@@ -1703,16 +1738,6 @@ try_this_zone:
1703this_zone_full: 1738this_zone_full:
1704 if (NUMA_BUILD) 1739 if (NUMA_BUILD)
1705 zlc_mark_zone_full(zonelist, z); 1740 zlc_mark_zone_full(zonelist, z);
1706try_next_zone:
1707 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1708 /*
1709 * we do zlc_setup after the first zone is tried but only
1710 * if there are multiple nodes make it worthwhile
1711 */
1712 allowednodes = zlc_setup(zonelist, alloc_flags);
1713 zlc_active = 1;
1714 did_zlc_setup = 1;
1715 }
1716 } 1741 }
1717 1742
1718 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1743 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@ -1954,6 +1979,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1954 if (unlikely(!(*did_some_progress))) 1979 if (unlikely(!(*did_some_progress)))
1955 return NULL; 1980 return NULL;
1956 1981
1982 /* After successful reclaim, reconsider all zones for allocation */
1983 if (NUMA_BUILD)
1984 zlc_clear_zones_full(zonelist);
1985
1957retry: 1986retry:
1958 page = get_page_from_freelist(gfp_mask, nodemask, order, 1987 page = get_page_from_freelist(gfp_mask, nodemask, order,
1959 zonelist, high_zoneidx, 1988 zonelist, high_zoneidx,
@@ -2193,6 +2222,14 @@ rebalance:
2193 2222
2194 goto restart; 2223 goto restart;
2195 } 2224 }
2225
2226 /*
2227 * Suspend converts GFP_KERNEL to __GFP_WAIT which can
2228 * prevent reclaim making forward progress without
2229 * invoking OOM. Bail if we are suspending
2230 */
2231 if (pm_suspending())
2232 goto nopage;
2196 } 2233 }
2197 2234
2198 /* Check if we should retry the allocation */ 2235 /* Check if we should retry the allocation */
@@ -3356,9 +3393,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3356 unsigned long block_migratetype; 3393 unsigned long block_migratetype;
3357 int reserve; 3394 int reserve;
3358 3395
3359 /* Get the start pfn, end pfn and the number of blocks to reserve */ 3396 /*
 3397 * Get the start pfn, end pfn and the number of blocks to reserve.
 3398 * We have to be careful to align start_pfn to pageblock_nr_pages to
3399 * make sure that we always check pfn_valid for the first page in
3400 * the block.
3401 */
3360 start_pfn = zone->zone_start_pfn; 3402 start_pfn = zone->zone_start_pfn;
3361 end_pfn = start_pfn + zone->spanned_pages; 3403 end_pfn = start_pfn + zone->spanned_pages;
3404 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3362 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3405 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3363 pageblock_order; 3406 pageblock_order;
3364 3407
@@ -4585,6 +4628,60 @@ void __init sort_node_map(void)
4585 cmp_node_active_region, NULL); 4628 cmp_node_active_region, NULL);
4586} 4629}
4587 4630
4631/**
4632 * node_map_pfn_alignment - determine the maximum internode alignment
4633 *
4634 * This function should be called after node map is populated and sorted.
4635 * It calculates the maximum power of two alignment which can distinguish
4636 * all the nodes.
4637 *
4638 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4639 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4640 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4641 * shifted, 1GiB is enough and this function will indicate so.
4642 *
4643 * This is used to test whether pfn -> nid mapping of the chosen memory
4644 * model has fine enough granularity to avoid incorrect mapping for the
4645 * populated node map.
4646 *
4647 * Returns the determined alignment in pfn's. 0 if there is no alignment
4648 * requirement (single node).
4649 */
4650unsigned long __init node_map_pfn_alignment(void)
4651{
4652 unsigned long accl_mask = 0, last_end = 0;
4653 int last_nid = -1;
4654 int i;
4655
4656 for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
4657 int nid = early_node_map[i].nid;
4658 unsigned long start = early_node_map[i].start_pfn;
4659 unsigned long end = early_node_map[i].end_pfn;
4660 unsigned long mask;
4661
4662 if (!start || last_nid < 0 || last_nid == nid) {
4663 last_nid = nid;
4664 last_end = end;
4665 continue;
4666 }
4667
4668 /*
4669 * Start with a mask granular enough to pin-point to the
4670 * start pfn and tick off bits one-by-one until it becomes
4671 * too coarse to separate the current node from the last.
4672 */
4673 mask = ~((1 << __ffs(start)) - 1);
4674 while (mask && last_end <= (start & (mask << 1)))
4675 mask <<= 1;
4676
4677 /* accumulate all internode masks */
4678 accl_mask |= mask;
4679 }
4680
4681 /* convert mask to number of pages */
4682 return ~accl_mask + 1;
4683}
4684
4588/* Find the lowest pfn for a node */ 4685/* Find the lowest pfn for a node */
4589static unsigned long __init find_min_pfn_for_node(int nid) 4686static unsigned long __init find_min_pfn_for_node(int nid)
4590{ 4687{
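
The one-line change to __zone_watermark_ok() replaces the hard-coded halving of the watermark per order with the new min_free_order_shift tunable. The per-order loop works like the sketch below: pages at orders lower than the request are subtracted from the free count and the required minimum shrinks by the shift at each step. This is a toy model only; lowmem reserves and the ALLOC_* flags handled by the real function are left out:

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_ORDER 11

    static int min_free_order_shift = 1;    /* the new tunable; 1 keeps the old behaviour */

    static bool zone_watermark_ok(unsigned long free_pages, unsigned long mark,
                                  unsigned int order,
                                  const unsigned long nr_free[MAX_ORDER])
    {
        unsigned long min = mark;
        unsigned int o;

        if (free_pages <= min)              /* order-0 check */
            return false;

        for (o = 0; o < order; o++) {
            /* blocks of this order are useless for a higher-order request */
            free_pages -= nr_free[o] << o;

            /* require fewer higher-order pages to be free */
            min >>= min_free_order_shift;

            if (free_pages <= min)
                return false;
        }
        return true;
    }

    int main(void)
    {
        unsigned long nr_free[MAX_ORDER] = { 600, 100, 30, 8, 2 };
        unsigned long free_pages = 600 + 100 * 2 + 30 * 4 + 8 * 8 + 2 * 16;

        printf("order-3 ok: %d\n", zone_watermark_ok(free_pages, 256, 3, nr_free));
        return 0;
    }
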
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 53bffc6c293..39d216d535e 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -225,8 +225,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
225 unsigned long start, end, pfn; 225 unsigned long start, end, pfn;
226 int fail = 0; 226 int fail = 0;
227 227
228 start = start_pfn & ~(PAGES_PER_SECTION - 1); 228 start = SECTION_ALIGN_DOWN(start_pfn);
229 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); 229 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
230 230
231 if (nid == -1) { 231 if (nid == -1) {
232 /* 232 /*
@@ -258,8 +258,8 @@ int __meminit offline_page_cgroup(unsigned long start_pfn,
258{ 258{
259 unsigned long start, end, pfn; 259 unsigned long start, end, pfn;
260 260
261 start = start_pfn & ~(PAGES_PER_SECTION - 1); 261 start = SECTION_ALIGN_DOWN(start_pfn);
262 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); 262 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
263 263
264 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) 264 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
265 __free_page_cgroup(pfn); 265 __free_page_cgroup(pfn);
@@ -537,7 +537,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
537nomem: 537nomem:
538 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); 538 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
539 printk(KERN_INFO 539 printk(KERN_INFO
540 "swap_cgroup can be disabled by noswapaccount boot option\n"); 540 "swap_cgroup can be disabled by swapaccount=0 boot option\n");
541 return -ENOMEM; 541 return -ENOMEM;
542} 542}
543 543
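
The page_cgroup.c hunk above is a pure cleanup: SECTION_ALIGN_DOWN()/SECTION_ALIGN_UP() compute the same section bounds the open-coded masking and ALIGN() did. A hedged standalone C sketch of that equivalence, with a made-up PAGES_PER_SECTION and macro bodies written from my reading of the helpers:

#include <assert.h>
#include <stdio.h>

#define PAGES_PER_SECTION	0x8000UL	/* invented, power of two */
#define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION - 1))
#define SECTION_ALIGN_DOWN(pfn)	((pfn) & PAGE_SECTION_MASK)
#define SECTION_ALIGN_UP(pfn)	(((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long start_pfn = 0x12345, nr_pages = 0x2000;

	/* the helpers reproduce the expressions the hunk replaced */
	assert(SECTION_ALIGN_DOWN(start_pfn) ==
	       (start_pfn & ~(PAGES_PER_SECTION - 1)));
	assert(SECTION_ALIGN_UP(start_pfn + nr_pages) ==
	       ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION));

	printf("start=%#lx end=%#lx\n",
	       SECTION_ALIGN_DOWN(start_pfn),
	       SECTION_ALIGN_UP(start_pfn + nr_pages));
	return 0;
}
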
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index c3450d53361..2f5cf10ff66 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -126,7 +126,39 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
126 126
127 return 0; 127 return 0;
128} 128}
129#endif 129
130static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
131{
132 struct vm_area_struct *vma;
133
134 /* We don't need vma lookup at all. */
135 if (!walk->hugetlb_entry)
136 return NULL;
137
138 VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
139 vma = find_vma(walk->mm, addr);
140 if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
141 return vma;
142
143 return NULL;
144}
145
146#else /* CONFIG_HUGETLB_PAGE */
147static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
148{
149 return NULL;
150}
151
152static int walk_hugetlb_range(struct vm_area_struct *vma,
153 unsigned long addr, unsigned long end,
154 struct mm_walk *walk)
155{
156 return 0;
157}
158
159#endif /* CONFIG_HUGETLB_PAGE */
160
161
130 162
131/** 163/**
132 * walk_page_range - walk a memory map's page tables with a callback 164 * walk_page_range - walk a memory map's page tables with a callback
@@ -144,11 +176,15 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
144 * associated range, and a copy of the original mm_walk for access to 176 * associated range, and a copy of the original mm_walk for access to
145 * the ->private or ->mm fields. 177 * the ->private or ->mm fields.
146 * 178 *
147 * No locks are taken, but the bottom level iterator will map PTE 179 * Usually no locks are taken, but splitting transparent huge page may
180 * take page table lock. And the bottom level iterator will map PTE
148 * directories from highmem if necessary. 181 * directories from highmem if necessary.
149 * 182 *
150 * If any callback returns a non-zero value, the walk is aborted and 183 * If any callback returns a non-zero value, the walk is aborted and
151 * the return value is propagated back to the caller. Otherwise 0 is returned. 184 * the return value is propagated back to the caller. Otherwise 0 is returned.
185 *
186 * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry
187 * is !NULL.
152 */ 188 */
153int walk_page_range(unsigned long addr, unsigned long end, 189int walk_page_range(unsigned long addr, unsigned long end,
154 struct mm_walk *walk) 190 struct mm_walk *walk)
@@ -165,18 +201,17 @@ int walk_page_range(unsigned long addr, unsigned long end,
165 201
166 pgd = pgd_offset(walk->mm, addr); 202 pgd = pgd_offset(walk->mm, addr);
167 do { 203 do {
168 struct vm_area_struct *uninitialized_var(vma); 204 struct vm_area_struct *vma;
169 205
170 next = pgd_addr_end(addr, end); 206 next = pgd_addr_end(addr, end);
171 207
172#ifdef CONFIG_HUGETLB_PAGE
173 /* 208 /*
174 * handle hugetlb vma individually because pagetable walk for 209 * handle hugetlb vma individually because pagetable walk for
175 * the hugetlb page is dependent on the architecture and 210 * the hugetlb page is dependent on the architecture and
176 * we can't handle it in the same manner as non-huge pages. 211 * we can't handle it in the same manner as non-huge pages.
177 */ 212 */
178 vma = find_vma(walk->mm, addr); 213 vma = hugetlb_vma(addr, walk);
179 if (vma && is_vm_hugetlb_page(vma)) { 214 if (vma) {
180 if (vma->vm_end < next) 215 if (vma->vm_end < next)
181 next = vma->vm_end; 216 next = vma->vm_end;
182 /* 217 /*
@@ -189,7 +224,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
189 pgd = pgd_offset(walk->mm, next); 224 pgd = pgd_offset(walk->mm, next);
190 continue; 225 continue;
191 } 226 }
192#endif 227
193 if (pgd_none_or_clear_bad(pgd)) { 228 if (pgd_none_or_clear_bad(pgd)) {
194 if (walk->pte_hole) 229 if (walk->pte_hole)
195 err = walk->pte_hole(addr, next, walk); 230 err = walk->pte_hole(addr, next, walk);
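
The pagewalk.c change above folds the hugetlb handling behind a hugetlb_vma() helper and documents that mmap_sem must be held for at least read whenever a hugetlb_entry callback is supplied. Below is a kernel-style sketch of a caller honoring that rule; it is not from the patch, the callback name and counting logic are invented, and the mm_walk/hugetlb_entry signatures follow my reading of include/linux/mm.h from this period.

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/sched.h>

/* invented callback: count populated hugetlb PTEs in a range */
static int count_huge_pte(pte_t *pte, unsigned long hmask,
			  unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (!pte_none(*pte))
		(*count)++;
	return 0;
}

static unsigned long count_hugetlb_ptes(struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	unsigned long count = 0;
	struct mm_walk walk = {
		.hugetlb_entry	= count_huge_pte,
		.mm		= mm,
		.private	= &count,
	};

	/* required whenever ->hugetlb_entry is set: hugetlb_vma() asserts it */
	down_read(&mm->mmap_sem);
	walk_page_range(start, end, &walk);
	up_read(&mm->mmap_sem);

	return count;
}
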
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index ea534960a04..bfad7246665 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -143,8 +143,8 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
143 int page_start, int page_end) 143 int page_start, int page_end)
144{ 144{
145 flush_cache_vunmap( 145 flush_cache_vunmap(
146 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), 146 pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
147 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); 147 pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
148} 148}
149 149
150static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) 150static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
@@ -206,8 +206,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
206 int page_start, int page_end) 206 int page_start, int page_end)
207{ 207{
208 flush_tlb_kernel_range( 208 flush_tlb_kernel_range(
209 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), 209 pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
210 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); 210 pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
211} 211}
212 212
213static int __pcpu_map_pages(unsigned long addr, struct page **pages, 213static int __pcpu_map_pages(unsigned long addr, struct page **pages,
@@ -284,8 +284,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
284 int page_start, int page_end) 284 int page_start, int page_end)
285{ 285{
286 flush_cache_vmap( 286 flush_cache_vmap(
287 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), 287 pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
288 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); 288 pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
289} 289}
290 290
291/** 291/**
diff --git a/mm/percpu.c b/mm/percpu.c
index bf80e55dbed..0ae7a09141e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -116,9 +116,9 @@ static int pcpu_atom_size __read_mostly;
116static int pcpu_nr_slots __read_mostly; 116static int pcpu_nr_slots __read_mostly;
117static size_t pcpu_chunk_struct_size __read_mostly; 117static size_t pcpu_chunk_struct_size __read_mostly;
118 118
119/* cpus with the lowest and highest unit numbers */ 119/* cpus with the lowest and highest unit addresses */
120static unsigned int pcpu_first_unit_cpu __read_mostly; 120static unsigned int pcpu_low_unit_cpu __read_mostly;
121static unsigned int pcpu_last_unit_cpu __read_mostly; 121static unsigned int pcpu_high_unit_cpu __read_mostly;
122 122
123/* the address of the first chunk which starts with the kernel static area */ 123/* the address of the first chunk which starts with the kernel static area */
124void *pcpu_base_addr __read_mostly; 124void *pcpu_base_addr __read_mostly;
@@ -984,19 +984,19 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
984{ 984{
985 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); 985 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
986 bool in_first_chunk = false; 986 bool in_first_chunk = false;
987 unsigned long first_start, first_end; 987 unsigned long first_low, first_high;
988 unsigned int cpu; 988 unsigned int cpu;
989 989
990 /* 990 /*
991 * The following test on first_start/end isn't strictly 991 * The following test on unit_low/high isn't strictly
992 * necessary but will speed up lookups of addresses which 992 * necessary but will speed up lookups of addresses which
993 * aren't in the first chunk. 993 * aren't in the first chunk.
994 */ 994 */
995 first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0); 995 first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
996 first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu, 996 first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
997 pcpu_unit_pages); 997 pcpu_unit_pages);
998 if ((unsigned long)addr >= first_start && 998 if ((unsigned long)addr >= first_low &&
999 (unsigned long)addr < first_end) { 999 (unsigned long)addr < first_high) {
1000 for_each_possible_cpu(cpu) { 1000 for_each_possible_cpu(cpu) {
1001 void *start = per_cpu_ptr(base, cpu); 1001 void *start = per_cpu_ptr(base, cpu);
1002 1002
@@ -1011,9 +1011,11 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
1011 if (!is_vmalloc_addr(addr)) 1011 if (!is_vmalloc_addr(addr))
1012 return __pa(addr); 1012 return __pa(addr);
1013 else 1013 else
1014 return page_to_phys(vmalloc_to_page(addr)); 1014 return page_to_phys(vmalloc_to_page(addr)) +
1015 offset_in_page(addr);
1015 } else 1016 } else
1016 return page_to_phys(pcpu_addr_to_page(addr)); 1017 return page_to_phys(pcpu_addr_to_page(addr)) +
1018 offset_in_page(addr);
1017} 1019}
1018 1020
1019/** 1021/**
@@ -1233,7 +1235,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1233 1235
1234 for (cpu = 0; cpu < nr_cpu_ids; cpu++) 1236 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1235 unit_map[cpu] = UINT_MAX; 1237 unit_map[cpu] = UINT_MAX;
1236 pcpu_first_unit_cpu = NR_CPUS; 1238
1239 pcpu_low_unit_cpu = NR_CPUS;
1240 pcpu_high_unit_cpu = NR_CPUS;
1237 1241
1238 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { 1242 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
1239 const struct pcpu_group_info *gi = &ai->groups[group]; 1243 const struct pcpu_group_info *gi = &ai->groups[group];
@@ -1253,9 +1257,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1253 unit_map[cpu] = unit + i; 1257 unit_map[cpu] = unit + i;
1254 unit_off[cpu] = gi->base_offset + i * ai->unit_size; 1258 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
1255 1259
1256 if (pcpu_first_unit_cpu == NR_CPUS) 1260 /* determine low/high unit_cpu */
1257 pcpu_first_unit_cpu = cpu; 1261 if (pcpu_low_unit_cpu == NR_CPUS ||
1258 pcpu_last_unit_cpu = cpu; 1262 unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
1263 pcpu_low_unit_cpu = cpu;
1264 if (pcpu_high_unit_cpu == NR_CPUS ||
1265 unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
1266 pcpu_high_unit_cpu = cpu;
1259 } 1267 }
1260 } 1268 }
1261 pcpu_nr_units = unit; 1269 pcpu_nr_units = unit;
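
Two independent fixes sit in the percpu hunks above: per_cpu_ptr_to_phys() now adds offset_in_page(addr) so addresses that are not page-aligned translate correctly, and the cached low/high CPUs are chosen by unit offset rather than by assignment order, since group base offsets need not be monotonic. The standalone userspace sketch below illustrates only the second point, with invented offsets.

#include <stdio.h>

int main(void)
{
	/* invented unit_off[cpu]: byte offset of each cpu's unit in a chunk */
	unsigned long unit_off[] = { 0x30000, 0x00000, 0x10000, 0x20000 };
	int nr_cpus = 4, cpu, low = -1, high = -1;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (low < 0 || unit_off[cpu] < unit_off[low])
			low = cpu;
		if (high < 0 || unit_off[cpu] > unit_off[high])
			high = cpu;
	}

	/* the old first/last-assigned logic would have cached cpu 0 and cpu 3,
	 * whose units do not bound the chunk's address range */
	printf("low unit: cpu %d (off %#lx), high unit: cpu %d (off %#lx)\n",
	       low, unit_off[low], high, unit_off[high]);
	return 0;
}
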
diff --git a/mm/rmap.c b/mm/rmap.c
index 23295f65ae4..8005080fb9e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,7 +21,6 @@
21 * Lock ordering in mm: 21 * Lock ordering in mm:
22 * 22 *
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * inode->i_alloc_sem (vmtruncate_range)
25 * mm->mmap_sem 24 * mm->mmap_sem
26 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
27 * mapping->i_mmap_mutex 26 * mapping->i_mmap_mutex
@@ -32,11 +31,11 @@
32 * mmlist_lock (in mmput, drain_mmlist and others) 31 * mmlist_lock (in mmput, drain_mmlist and others)
33 * mapping->private_lock (in __set_page_dirty_buffers) 32 * mapping->private_lock (in __set_page_dirty_buffers)
34 * inode->i_lock (in set_page_dirty's __mark_inode_dirty) 33 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
35 * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) 34 * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
36 * sb_lock (within inode_lock in fs/fs-writeback.c) 35 * sb_lock (within inode_lock in fs/fs-writeback.c)
37 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
38 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
39 * within inode_wb_list_lock in __sync_single_inode) 38 * within bdi.wb->list_lock in __sync_single_inode)
40 * 39 *
41 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) 40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
42 * ->tasklist_lock 41 * ->tasklist_lock
@@ -870,11 +869,11 @@ int page_referenced(struct page *page,
870 vm_flags); 869 vm_flags);
871 if (we_locked) 870 if (we_locked)
872 unlock_page(page); 871 unlock_page(page);
872
873 if (page_test_and_clear_young(page_to_pfn(page)))
874 referenced++;
873 } 875 }
874out: 876out:
875 if (page_test_and_clear_young(page_to_pfn(page)))
876 referenced++;
877
878 return referenced; 877 return referenced;
879} 878}
880 879
diff --git a/mm/shmem.c b/mm/shmem.c
index fcedf5464eb..fba53caba0d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -6,7 +6,8 @@
6 * 2000-2001 Christoph Rohland 6 * 2000-2001 Christoph Rohland
7 * 2000-2001 SAP AG 7 * 2000-2001 SAP AG
8 * 2002 Red Hat Inc. 8 * 2002 Red Hat Inc.
9 * Copyright (C) 2002-2005 Hugh Dickins. 9 * Copyright (C) 2002-2011 Hugh Dickins.
10 * Copyright (C) 2011 Google Inc.
10 * Copyright (C) 2002-2005 VERITAS Software Corporation. 11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
11 * Copyright (C) 2004 Andi Kleen, SuSE Labs 12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
12 * 13 *
@@ -28,7 +29,6 @@
28#include <linux/file.h> 29#include <linux/file.h>
29#include <linux/mm.h> 30#include <linux/mm.h>
30#include <linux/module.h> 31#include <linux/module.h>
31#include <linux/percpu_counter.h>
32#include <linux/swap.h> 32#include <linux/swap.h>
33 33
34static struct vfsmount *shm_mnt; 34static struct vfsmount *shm_mnt;
@@ -51,6 +51,9 @@ static struct vfsmount *shm_mnt;
51#include <linux/shmem_fs.h> 51#include <linux/shmem_fs.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/blkdev.h> 53#include <linux/blkdev.h>
54#include <linux/pagevec.h>
55#include <linux/percpu_counter.h>
56#include <linux/splice.h>
54#include <linux/security.h> 57#include <linux/security.h>
55#include <linux/swapops.h> 58#include <linux/swapops.h>
56#include <linux/mempolicy.h> 59#include <linux/mempolicy.h>
@@ -62,43 +65,17 @@ static struct vfsmount *shm_mnt;
62#include <linux/magic.h> 65#include <linux/magic.h>
63 66
64#include <asm/uaccess.h> 67#include <asm/uaccess.h>
65#include <asm/div64.h>
66#include <asm/pgtable.h> 68#include <asm/pgtable.h>
67 69
68/*
69 * The maximum size of a shmem/tmpfs file is limited by the maximum size of
70 * its triple-indirect swap vector - see illustration at shmem_swp_entry().
71 *
72 * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
73 * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum
74 * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
75 * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
76 *
77 * We use / and * instead of shifts in the definitions below, so that the swap
78 * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
79 */
80#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
81#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
82
83#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
84#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
85
86#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
87#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
88
89#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) 70#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
90#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) 71#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
91 72
92/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
93#define SHMEM_PAGEIN VM_READ
94#define SHMEM_TRUNCATE VM_WRITE
95
96/* Definition to limit shmem_truncate's steps between cond_rescheds */
97#define LATENCY_LIMIT 64
98
99/* Pretend that each entry is of this size in directory's i_size */ 73/* Pretend that each entry is of this size in directory's i_size */
100#define BOGO_DIRENT_SIZE 20 74#define BOGO_DIRENT_SIZE 20
101 75
76/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
77#define SHORT_SYMLINK_LEN 128
78
102struct shmem_xattr { 79struct shmem_xattr {
103 struct list_head list; /* anchored by shmem_inode_info->xattr_list */ 80 struct list_head list; /* anchored by shmem_inode_info->xattr_list */
104 char *name; /* xattr name */ 81 char *name; /* xattr name */
@@ -106,7 +83,7 @@ struct shmem_xattr {
106 char value[0]; 83 char value[0];
107}; 84};
108 85
109/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ 86/* Flag allocation requirements to shmem_getpage */
110enum sgp_type { 87enum sgp_type {
111 SGP_READ, /* don't exceed i_size, don't allocate page */ 88 SGP_READ, /* don't exceed i_size, don't allocate page */
112 SGP_CACHE, /* don't exceed i_size, may allocate page */ 89 SGP_CACHE, /* don't exceed i_size, may allocate page */
@@ -126,57 +103,14 @@ static unsigned long shmem_default_max_inodes(void)
126} 103}
127#endif 104#endif
128 105
129static int shmem_getpage(struct inode *inode, unsigned long idx, 106static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
130 struct page **pagep, enum sgp_type sgp, int *type); 107 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
131
132static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
133{
134 /*
135 * The above definition of ENTRIES_PER_PAGE, and the use of
136 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
137 * might be reconsidered if it ever diverges from PAGE_SIZE.
138 *
139 * Mobility flags are masked out as swap vectors cannot move
140 */
141 return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
142 PAGE_CACHE_SHIFT-PAGE_SHIFT);
143}
144
145static inline void shmem_dir_free(struct page *page)
146{
147 __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
148}
149
150static struct page **shmem_dir_map(struct page *page)
151{
152 return (struct page **)kmap_atomic(page, KM_USER0);
153}
154
155static inline void shmem_dir_unmap(struct page **dir)
156{
157 kunmap_atomic(dir, KM_USER0);
158}
159
160static swp_entry_t *shmem_swp_map(struct page *page)
161{
162 return (swp_entry_t *)kmap_atomic(page, KM_USER1);
163}
164
165static inline void shmem_swp_balance_unmap(void)
166{
167 /*
168 * When passing a pointer to an i_direct entry, to code which
169 * also handles indirect entries and so will shmem_swp_unmap,
170 * we must arrange for the preempt count to remain in balance.
171 * What kmap_atomic of a lowmem page does depends on config
172 * and architecture, so pretend to kmap_atomic some lowmem page.
173 */
174 (void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
175}
176 108
177static inline void shmem_swp_unmap(swp_entry_t *entry) 109static inline int shmem_getpage(struct inode *inode, pgoff_t index,
110 struct page **pagep, enum sgp_type sgp, int *fault_type)
178{ 111{
179 kunmap_atomic(entry, KM_USER1); 112 return shmem_getpage_gfp(inode, index, pagep, sgp,
113 mapping_gfp_mask(inode->i_mapping), fault_type);
180} 114}
181 115
182static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) 116static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
@@ -236,17 +170,6 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
236static LIST_HEAD(shmem_swaplist); 170static LIST_HEAD(shmem_swaplist);
237static DEFINE_MUTEX(shmem_swaplist_mutex); 171static DEFINE_MUTEX(shmem_swaplist_mutex);
238 172
239static void shmem_free_blocks(struct inode *inode, long pages)
240{
241 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
242 if (sbinfo->max_blocks) {
243 percpu_counter_add(&sbinfo->used_blocks, -pages);
244 spin_lock(&inode->i_lock);
245 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
246 spin_unlock(&inode->i_lock);
247 }
248}
249
250static int shmem_reserve_inode(struct super_block *sb) 173static int shmem_reserve_inode(struct super_block *sb)
251{ 174{
252 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 175 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
@@ -273,7 +196,7 @@ static void shmem_free_inode(struct super_block *sb)
273} 196}
274 197
275/** 198/**
276 * shmem_recalc_inode - recalculate the size of an inode 199 * shmem_recalc_inode - recalculate the block usage of an inode
277 * @inode: inode to recalc 200 * @inode: inode to recalc
278 * 201 *
279 * We have to calculate the free blocks since the mm can drop 202 * We have to calculate the free blocks since the mm can drop
@@ -291,474 +214,297 @@ static void shmem_recalc_inode(struct inode *inode)
291 214
292 freed = info->alloced - info->swapped - inode->i_mapping->nrpages; 215 freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
293 if (freed > 0) { 216 if (freed > 0) {
217 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
218 if (sbinfo->max_blocks)
219 percpu_counter_add(&sbinfo->used_blocks, -freed);
294 info->alloced -= freed; 220 info->alloced -= freed;
221 inode->i_blocks -= freed * BLOCKS_PER_PAGE;
295 shmem_unacct_blocks(info->flags, freed); 222 shmem_unacct_blocks(info->flags, freed);
296 shmem_free_blocks(inode, freed);
297 } 223 }
298} 224}
299 225
300/** 226/*
301 * shmem_swp_entry - find the swap vector position in the info structure 227 * Replace item expected in radix tree by a new item, while holding tree lock.
302 * @info: info structure for the inode 228 */
303 * @index: index of the page to find 229static int shmem_radix_tree_replace(struct address_space *mapping,
304 * @page: optional page to add to the structure. Has to be preset to 230 pgoff_t index, void *expected, void *replacement)
305 * all zeros 231{
306 * 232 void **pslot;
307 * If there is no space allocated yet it will return NULL when 233 void *item = NULL;
308 * page is NULL, else it will use the page for the needed block, 234
309 * setting it to NULL on return to indicate that it has been used. 235 VM_BUG_ON(!expected);
310 * 236 pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
311 * The swap vector is organized the following way: 237 if (pslot)
312 * 238 item = radix_tree_deref_slot_protected(pslot,
313 * There are SHMEM_NR_DIRECT entries directly stored in the 239 &mapping->tree_lock);
314 * shmem_inode_info structure. So small files do not need an addional 240 if (item != expected)
315 * allocation. 241 return -ENOENT;
316 * 242 if (replacement)
317 * For pages with index > SHMEM_NR_DIRECT there is the pointer 243 radix_tree_replace_slot(pslot, replacement);
318 * i_indirect which points to a page which holds in the first half 244 else
319 * doubly indirect blocks, in the second half triple indirect blocks: 245 radix_tree_delete(&mapping->page_tree, index);
320 * 246 return 0;
321 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the 247}
322 * following layout (for SHMEM_NR_DIRECT == 16): 248
323 * 249/*
324 * i_indirect -> dir --> 16-19 250 * Like add_to_page_cache_locked, but error if expected item has gone.
325 * | +-> 20-23
326 * |
327 * +-->dir2 --> 24-27
328 * | +-> 28-31
329 * | +-> 32-35
330 * | +-> 36-39
331 * |
332 * +-->dir3 --> 40-43
333 * +-> 44-47
334 * +-> 48-51
335 * +-> 52-55
336 */ 251 */
337static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) 252static int shmem_add_to_page_cache(struct page *page,
253 struct address_space *mapping,
254 pgoff_t index, gfp_t gfp, void *expected)
338{ 255{
339 unsigned long offset; 256 int error = 0;
340 struct page **dir;
341 struct page *subdir;
342 257
343 if (index < SHMEM_NR_DIRECT) { 258 VM_BUG_ON(!PageLocked(page));
344 shmem_swp_balance_unmap(); 259 VM_BUG_ON(!PageSwapBacked(page));
345 return info->i_direct+index;
346 }
347 if (!info->i_indirect) {
348 if (page) {
349 info->i_indirect = *page;
350 *page = NULL;
351 }
352 return NULL; /* need another page */
353 }
354 260
355 index -= SHMEM_NR_DIRECT; 261 if (!expected)
356 offset = index % ENTRIES_PER_PAGE; 262 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
357 index /= ENTRIES_PER_PAGE; 263 if (!error) {
358 dir = shmem_dir_map(info->i_indirect); 264 page_cache_get(page);
359 265 page->mapping = mapping;
360 if (index >= ENTRIES_PER_PAGE/2) { 266 page->index = index;
361 index -= ENTRIES_PER_PAGE/2; 267
362 dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE; 268 spin_lock_irq(&mapping->tree_lock);
363 index %= ENTRIES_PER_PAGE; 269 if (!expected)
364 subdir = *dir; 270 error = radix_tree_insert(&mapping->page_tree,
365 if (!subdir) { 271 index, page);
366 if (page) { 272 else
367 *dir = *page; 273 error = shmem_radix_tree_replace(mapping, index,
368 *page = NULL; 274 expected, page);
369 } 275 if (!error) {
370 shmem_dir_unmap(dir); 276 mapping->nrpages++;
371 return NULL; /* need another page */ 277 __inc_zone_page_state(page, NR_FILE_PAGES);
372 } 278 __inc_zone_page_state(page, NR_SHMEM);
373 shmem_dir_unmap(dir); 279 spin_unlock_irq(&mapping->tree_lock);
374 dir = shmem_dir_map(subdir); 280 } else {
375 } 281 page->mapping = NULL;
376 282 spin_unlock_irq(&mapping->tree_lock);
377 dir += index; 283 page_cache_release(page);
378 subdir = *dir;
379 if (!subdir) {
380 if (!page || !(subdir = *page)) {
381 shmem_dir_unmap(dir);
382 return NULL; /* need a page */
383 } 284 }
384 *dir = subdir; 285 if (!expected)
385 *page = NULL; 286 radix_tree_preload_end();
386 } 287 }
387 shmem_dir_unmap(dir); 288 if (error)
388 return shmem_swp_map(subdir) + offset; 289 mem_cgroup_uncharge_cache_page(page);
290 return error;
389} 291}
390 292
391static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) 293/*
294 * Like delete_from_page_cache, but substitutes swap for page.
295 */
296static void shmem_delete_from_page_cache(struct page *page, void *radswap)
392{ 297{
393 long incdec = value? 1: -1; 298 struct address_space *mapping = page->mapping;
299 int error;
394 300
395 entry->val = value; 301 spin_lock_irq(&mapping->tree_lock);
396 info->swapped += incdec; 302 error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
397 if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { 303 page->mapping = NULL;
398 struct page *page = kmap_atomic_to_page(entry); 304 mapping->nrpages--;
399 set_page_private(page, page_private(page) + incdec); 305 __dec_zone_page_state(page, NR_FILE_PAGES);
400 } 306 __dec_zone_page_state(page, NR_SHMEM);
307 spin_unlock_irq(&mapping->tree_lock);
308 page_cache_release(page);
309 BUG_ON(error);
401} 310}
402 311
403/** 312/*
404 * shmem_swp_alloc - get the position of the swap entry for the page. 313 * Like find_get_pages, but collecting swap entries as well as pages.
405 * @info: info structure for the inode
406 * @index: index of the page to find
407 * @sgp: check and recheck i_size? skip allocation?
408 *
409 * If the entry does not exist, allocate it.
410 */ 314 */
411static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) 315static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
412{ 316 pgoff_t start, unsigned int nr_pages,
413 struct inode *inode = &info->vfs_inode; 317 struct page **pages, pgoff_t *indices)
414 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 318{
415 struct page *page = NULL; 319 unsigned int i;
416 swp_entry_t *entry; 320 unsigned int ret;
417 321 unsigned int nr_found;
418 if (sgp != SGP_WRITE && 322
419 ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 323 rcu_read_lock();
420 return ERR_PTR(-EINVAL); 324restart:
421 325 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
422 while (!(entry = shmem_swp_entry(info, index, &page))) { 326 (void ***)pages, indices, start, nr_pages);
423 if (sgp == SGP_READ) 327 ret = 0;
424 return shmem_swp_map(ZERO_PAGE(0)); 328 for (i = 0; i < nr_found; i++) {
425 /* 329 struct page *page;
426 * Test used_blocks against 1 less max_blocks, since we have 1 data 330repeat:
427 * page (and perhaps indirect index pages) yet to allocate: 331 page = radix_tree_deref_slot((void **)pages[i]);
428 * a waste to allocate index if we cannot allocate data. 332 if (unlikely(!page))
429 */ 333 continue;
430 if (sbinfo->max_blocks) { 334 if (radix_tree_exception(page)) {
431 if (percpu_counter_compare(&sbinfo->used_blocks, 335 if (radix_tree_deref_retry(page))
432 sbinfo->max_blocks - 1) >= 0) 336 goto restart;
433 return ERR_PTR(-ENOSPC); 337 /*
434 percpu_counter_inc(&sbinfo->used_blocks); 338 * Otherwise, we must be storing a swap entry
435 spin_lock(&inode->i_lock); 339 * here as an exceptional entry: so return it
436 inode->i_blocks += BLOCKS_PER_PAGE; 340 * without attempting to raise page count.
437 spin_unlock(&inode->i_lock); 341 */
342 goto export;
438 } 343 }
344 if (!page_cache_get_speculative(page))
345 goto repeat;
439 346
440 spin_unlock(&info->lock); 347 /* Has the page moved? */
441 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); 348 if (unlikely(page != *((void **)pages[i]))) {
442 spin_lock(&info->lock); 349 page_cache_release(page);
443 350 goto repeat;
444 if (!page) {
445 shmem_free_blocks(inode, 1);
446 return ERR_PTR(-ENOMEM);
447 }
448 if (sgp != SGP_WRITE &&
449 ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
450 entry = ERR_PTR(-EINVAL);
451 break;
452 } 351 }
453 if (info->next_index <= index) 352export:
454 info->next_index = index + 1; 353 indices[ret] = indices[i];
455 } 354 pages[ret] = page;
456 if (page) { 355 ret++;
457 /* another task gave its page, or truncated the file */ 356 }
458 shmem_free_blocks(inode, 1); 357 if (unlikely(!ret && nr_found))
459 shmem_dir_free(page); 358 goto restart;
460 } 359 rcu_read_unlock();
461 if (info->next_index <= index && !IS_ERR(entry)) 360 return ret;
462 info->next_index = index + 1;
463 return entry;
464} 361}
465 362
466/** 363/*
467 * shmem_free_swp - free some swap entries in a directory 364 * Remove swap entry from radix tree, free the swap and its page cache.
468 * @dir: pointer to the directory
469 * @edir: pointer after last entry of the directory
470 * @punch_lock: pointer to spinlock when needed for the holepunch case
471 */ 365 */
472static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, 366static int shmem_free_swap(struct address_space *mapping,
473 spinlock_t *punch_lock) 367 pgoff_t index, void *radswap)
474{ 368{
475 spinlock_t *punch_unlock = NULL; 369 int error;
476 swp_entry_t *ptr; 370
477 int freed = 0; 371 spin_lock_irq(&mapping->tree_lock);
478 372 error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
479 for (ptr = dir; ptr < edir; ptr++) { 373 spin_unlock_irq(&mapping->tree_lock);
480 if (ptr->val) { 374 if (!error)
481 if (unlikely(punch_lock)) { 375 free_swap_and_cache(radix_to_swp_entry(radswap));
482 punch_unlock = punch_lock; 376 return error;
483 punch_lock = NULL;
484 spin_lock(punch_unlock);
485 if (!ptr->val)
486 continue;
487 }
488 free_swap_and_cache(*ptr);
489 *ptr = (swp_entry_t){0};
490 freed++;
491 }
492 }
493 if (punch_unlock)
494 spin_unlock(punch_unlock);
495 return freed;
496}
497
498static int shmem_map_and_free_swp(struct page *subdir, int offset,
499 int limit, struct page ***dir, spinlock_t *punch_lock)
500{
501 swp_entry_t *ptr;
502 int freed = 0;
503
504 ptr = shmem_swp_map(subdir);
505 for (; offset < limit; offset += LATENCY_LIMIT) {
506 int size = limit - offset;
507 if (size > LATENCY_LIMIT)
508 size = LATENCY_LIMIT;
509 freed += shmem_free_swp(ptr+offset, ptr+offset+size,
510 punch_lock);
511 if (need_resched()) {
512 shmem_swp_unmap(ptr);
513 if (*dir) {
514 shmem_dir_unmap(*dir);
515 *dir = NULL;
516 }
517 cond_resched();
518 ptr = shmem_swp_map(subdir);
519 }
520 }
521 shmem_swp_unmap(ptr);
522 return freed;
523} 377}
524 378
525static void shmem_free_pages(struct list_head *next) 379/*
380 * Pagevec may contain swap entries, so shuffle up pages before releasing.
381 */
382static void shmem_pagevec_release(struct pagevec *pvec)
526{ 383{
527 struct page *page; 384 int i, j;
528 int freed = 0; 385
529 386 for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
530 do { 387 struct page *page = pvec->pages[i];
531 page = container_of(next, struct page, lru); 388 if (!radix_tree_exceptional_entry(page))
532 next = next->next; 389 pvec->pages[j++] = page;
533 shmem_dir_free(page); 390 }
534 freed++; 391 pvec->nr = j;
535 if (freed >= LATENCY_LIMIT) { 392 pagevec_release(pvec);
536 cond_resched();
537 freed = 0;
538 }
539 } while (next);
540} 393}
541 394
542void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 395/*
396 * Remove range of pages and swap entries from radix tree, and free them.
397 */
398void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
543{ 399{
400 struct address_space *mapping = inode->i_mapping;
544 struct shmem_inode_info *info = SHMEM_I(inode); 401 struct shmem_inode_info *info = SHMEM_I(inode);
545 unsigned long idx; 402 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
546 unsigned long size; 403 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
547 unsigned long limit; 404 pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
548 unsigned long stage; 405 struct pagevec pvec;
549 unsigned long diroff; 406 pgoff_t indices[PAGEVEC_SIZE];
550 struct page **dir;
551 struct page *topdir;
552 struct page *middir;
553 struct page *subdir;
554 swp_entry_t *ptr;
555 LIST_HEAD(pages_to_free);
556 long nr_pages_to_free = 0;
557 long nr_swaps_freed = 0; 407 long nr_swaps_freed = 0;
558 int offset; 408 pgoff_t index;
559 int freed; 409 int i;
560 int punch_hole;
561 spinlock_t *needs_lock;
562 spinlock_t *punch_lock;
563 unsigned long upper_limit;
564 410
565 truncate_inode_pages_range(inode->i_mapping, start, end); 411 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
566 412
567 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 413 pagevec_init(&pvec, 0);
568 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 414 index = start;
569 if (idx >= info->next_index) 415 while (index <= end) {
570 return; 416 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
417 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
418 pvec.pages, indices);
419 if (!pvec.nr)
420 break;
421 mem_cgroup_uncharge_start();
422 for (i = 0; i < pagevec_count(&pvec); i++) {
423 struct page *page = pvec.pages[i];
571 424
572 spin_lock(&info->lock); 425 index = indices[i];
573 info->flags |= SHMEM_TRUNCATE; 426 if (index > end)
574 if (likely(end == (loff_t) -1)) { 427 break;
575 limit = info->next_index;
576 upper_limit = SHMEM_MAX_INDEX;
577 info->next_index = idx;
578 needs_lock = NULL;
579 punch_hole = 0;
580 } else {
581 if (end + 1 >= inode->i_size) { /* we may free a little more */
582 limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
583 PAGE_CACHE_SHIFT;
584 upper_limit = SHMEM_MAX_INDEX;
585 } else {
586 limit = (end + 1) >> PAGE_CACHE_SHIFT;
587 upper_limit = limit;
588 }
589 needs_lock = &info->lock;
590 punch_hole = 1;
591 }
592 428
593 topdir = info->i_indirect; 429 if (radix_tree_exceptional_entry(page)) {
594 if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { 430 nr_swaps_freed += !shmem_free_swap(mapping,
595 info->i_indirect = NULL; 431 index, page);
596 nr_pages_to_free++; 432 continue;
597 list_add(&topdir->lru, &pages_to_free); 433 }
434
435 if (!trylock_page(page))
436 continue;
437 if (page->mapping == mapping) {
438 VM_BUG_ON(PageWriteback(page));
439 truncate_inode_page(mapping, page);
440 }
441 unlock_page(page);
442 }
443 shmem_pagevec_release(&pvec);
444 mem_cgroup_uncharge_end();
445 cond_resched();
446 index++;
598 } 447 }
599 spin_unlock(&info->lock);
600 448
601 if (info->swapped && idx < SHMEM_NR_DIRECT) { 449 if (partial) {
602 ptr = info->i_direct; 450 struct page *page = NULL;
603 size = limit; 451 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
604 if (size > SHMEM_NR_DIRECT) 452 if (page) {
605 size = SHMEM_NR_DIRECT; 453 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
606 nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); 454 set_page_dirty(page);
455 unlock_page(page);
456 page_cache_release(page);
457 }
607 } 458 }
608 459
609 /* 460 index = start;
610 * If there are no indirect blocks or we are punching a hole 461 for ( ; ; ) {
611 * below indirect blocks, nothing to be done. 462 cond_resched();
612 */ 463 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
613 if (!topdir || limit <= SHMEM_NR_DIRECT) 464 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
614 goto done2; 465 pvec.pages, indices);
466 if (!pvec.nr) {
467 if (index == start)
468 break;
469 index = start;
470 continue;
471 }
472 if (index == start && indices[0] > end) {
473 shmem_pagevec_release(&pvec);
474 break;
475 }
476 mem_cgroup_uncharge_start();
477 for (i = 0; i < pagevec_count(&pvec); i++) {
478 struct page *page = pvec.pages[i];
615 479
616 /* 480 index = indices[i];
617 * The truncation case has already dropped info->lock, and we're safe 481 if (index > end)
618 * because i_size and next_index have already been lowered, preventing 482 break;
619 * access beyond. But in the punch_hole case, we still need to take
620 * the lock when updating the swap directory, because there might be
621 * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
622 * shmem_writepage. However, whenever we find we can remove a whole
623 * directory page (not at the misaligned start or end of the range),
624 * we first NULLify its pointer in the level above, and then have no
625 * need to take the lock when updating its contents: needs_lock and
626 * punch_lock (either pointing to info->lock or NULL) manage this.
627 */
628 483
629 upper_limit -= SHMEM_NR_DIRECT; 484 if (radix_tree_exceptional_entry(page)) {
630 limit -= SHMEM_NR_DIRECT; 485 nr_swaps_freed += !shmem_free_swap(mapping,
631 idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; 486 index, page);
632 offset = idx % ENTRIES_PER_PAGE; 487 continue;
633 idx -= offset;
634
635 dir = shmem_dir_map(topdir);
636 stage = ENTRIES_PER_PAGEPAGE/2;
637 if (idx < ENTRIES_PER_PAGEPAGE/2) {
638 middir = topdir;
639 diroff = idx/ENTRIES_PER_PAGE;
640 } else {
641 dir += ENTRIES_PER_PAGE/2;
642 dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
643 while (stage <= idx)
644 stage += ENTRIES_PER_PAGEPAGE;
645 middir = *dir;
646 if (*dir) {
647 diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
648 ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
649 if (!diroff && !offset && upper_limit >= stage) {
650 if (needs_lock) {
651 spin_lock(needs_lock);
652 *dir = NULL;
653 spin_unlock(needs_lock);
654 needs_lock = NULL;
655 } else
656 *dir = NULL;
657 nr_pages_to_free++;
658 list_add(&middir->lru, &pages_to_free);
659 } 488 }
660 shmem_dir_unmap(dir);
661 dir = shmem_dir_map(middir);
662 } else {
663 diroff = 0;
664 offset = 0;
665 idx = stage;
666 }
667 }
668 489
669 for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { 490 lock_page(page);
670 if (unlikely(idx == stage)) { 491 if (page->mapping == mapping) {
671 shmem_dir_unmap(dir); 492 VM_BUG_ON(PageWriteback(page));
672 dir = shmem_dir_map(topdir) + 493 truncate_inode_page(mapping, page);
673 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
674 while (!*dir) {
675 dir++;
676 idx += ENTRIES_PER_PAGEPAGE;
677 if (idx >= limit)
678 goto done1;
679 } 494 }
680 stage = idx + ENTRIES_PER_PAGEPAGE; 495 unlock_page(page);
681 middir = *dir;
682 if (punch_hole)
683 needs_lock = &info->lock;
684 if (upper_limit >= stage) {
685 if (needs_lock) {
686 spin_lock(needs_lock);
687 *dir = NULL;
688 spin_unlock(needs_lock);
689 needs_lock = NULL;
690 } else
691 *dir = NULL;
692 nr_pages_to_free++;
693 list_add(&middir->lru, &pages_to_free);
694 }
695 shmem_dir_unmap(dir);
696 cond_resched();
697 dir = shmem_dir_map(middir);
698 diroff = 0;
699 }
700 punch_lock = needs_lock;
701 subdir = dir[diroff];
702 if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
703 if (needs_lock) {
704 spin_lock(needs_lock);
705 dir[diroff] = NULL;
706 spin_unlock(needs_lock);
707 punch_lock = NULL;
708 } else
709 dir[diroff] = NULL;
710 nr_pages_to_free++;
711 list_add(&subdir->lru, &pages_to_free);
712 }
713 if (subdir && page_private(subdir) /* has swap entries */) {
714 size = limit - idx;
715 if (size > ENTRIES_PER_PAGE)
716 size = ENTRIES_PER_PAGE;
717 freed = shmem_map_and_free_swp(subdir,
718 offset, size, &dir, punch_lock);
719 if (!dir)
720 dir = shmem_dir_map(middir);
721 nr_swaps_freed += freed;
722 if (offset || punch_lock) {
723 spin_lock(&info->lock);
724 set_page_private(subdir,
725 page_private(subdir) - freed);
726 spin_unlock(&info->lock);
727 } else
728 BUG_ON(page_private(subdir) != freed);
729 } 496 }
730 offset = 0; 497 shmem_pagevec_release(&pvec);
731 } 498 mem_cgroup_uncharge_end();
732done1: 499 index++;
733 shmem_dir_unmap(dir);
734done2:
735 if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
736 /*
737 * Call truncate_inode_pages again: racing shmem_unuse_inode
738 * may have swizzled a page in from swap since
739 * truncate_pagecache or generic_delete_inode did it, before we
740 * lowered next_index. Also, though shmem_getpage checks
741 * i_size before adding to cache, no recheck after: so fix the
742 * narrow window there too.
743 */
744 truncate_inode_pages_range(inode->i_mapping, start, end);
745 } 500 }
746 501
747 spin_lock(&info->lock); 502 spin_lock(&info->lock);
748 info->flags &= ~SHMEM_TRUNCATE;
749 info->swapped -= nr_swaps_freed; 503 info->swapped -= nr_swaps_freed;
750 if (nr_pages_to_free)
751 shmem_free_blocks(inode, nr_pages_to_free);
752 shmem_recalc_inode(inode); 504 shmem_recalc_inode(inode);
753 spin_unlock(&info->lock); 505 spin_unlock(&info->lock);
754 506
755 /* 507 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
756 * Empty swap vector directory pages to be freed?
757 */
758 if (!list_empty(&pages_to_free)) {
759 pages_to_free.prev->next = NULL;
760 shmem_free_pages(pages_to_free.next);
761 }
762} 508}
763EXPORT_SYMBOL_GPL(shmem_truncate_range); 509EXPORT_SYMBOL_GPL(shmem_truncate_range);
764 510
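
The rewritten truncate path above rests on one idea: when a tmpfs page goes out to swap, its page-cache slot is not emptied but refilled with the swap entry encoded as a radix-tree "exceptional" value, which is why shmem_find_get_pages_and_swap() and shmem_free_swap() must tell pages and swap entries apart. The standalone userspace sketch below mimics that encoding; the shift and tag-bit values follow my reading of swp_to_radix_entry()/radix_to_swp_entry() in this series, and the swap value is invented.

#include <assert.h>
#include <stdio.h>

#define EXCEPTIONAL_ENTRY	2UL	/* tag bit: slot is not a page pointer */
#define EXCEPTIONAL_SHIFT	2

static void *swp_to_radix_entry(unsigned long swp_val)
{
	return (void *)((swp_val << EXCEPTIONAL_SHIFT) | EXCEPTIONAL_ENTRY);
}

static int radix_tree_exceptional_entry(void *slot)
{
	return (unsigned long)slot & EXCEPTIONAL_ENTRY;
}

static unsigned long radix_to_swp_entry(void *slot)
{
	return (unsigned long)slot >> EXCEPTIONAL_SHIFT;
}

int main(void)
{
	unsigned long swp_val = 0x1234;		/* invented swap entry value */
	void *slot = swp_to_radix_entry(swp_val);

	/* a real struct page * is at least 4-byte aligned, so bit 1 is free */
	assert(radix_tree_exceptional_entry(slot));
	assert(radix_to_swp_entry(slot) == swp_val);

	printf("slot %p decodes back to swap value %#lx\n", slot, swp_val);
	return 0;
}
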
@@ -774,37 +520,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
774 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 520 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
775 loff_t oldsize = inode->i_size; 521 loff_t oldsize = inode->i_size;
776 loff_t newsize = attr->ia_size; 522 loff_t newsize = attr->ia_size;
777 struct page *page = NULL;
778 523
779 if (newsize < oldsize) {
780 /*
781 * If truncating down to a partial page, then
782 * if that page is already allocated, hold it
783 * in memory until the truncation is over, so
784 * truncate_partial_page cannot miss it were
785 * it assigned to swap.
786 */
787 if (newsize & (PAGE_CACHE_SIZE-1)) {
788 (void) shmem_getpage(inode,
789 newsize >> PAGE_CACHE_SHIFT,
790 &page, SGP_READ, NULL);
791 if (page)
792 unlock_page(page);
793 }
794 /*
795 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
796 * detect if any pages might have been added to cache
797 * after truncate_inode_pages. But we needn't bother
798 * if it's being fully truncated to zero-length: the
799 * nrpages check is efficient enough in that case.
800 */
801 if (newsize) {
802 struct shmem_inode_info *info = SHMEM_I(inode);
803 spin_lock(&info->lock);
804 info->flags &= ~SHMEM_PAGEIN;
805 spin_unlock(&info->lock);
806 }
807 }
808 if (newsize != oldsize) { 524 if (newsize != oldsize) {
809 i_size_write(inode, newsize); 525 i_size_write(inode, newsize);
810 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 526 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -816,8 +532,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
816 /* unmap again to remove racily COWed private pages */ 532 /* unmap again to remove racily COWed private pages */
817 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 533 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
818 } 534 }
819 if (page)
820 page_cache_release(page);
821 } 535 }
822 536
823 setattr_copy(inode, attr); 537 setattr_copy(inode, attr);
@@ -842,7 +556,8 @@ static void shmem_evict_inode(struct inode *inode)
842 list_del_init(&info->swaplist); 556 list_del_init(&info->swaplist);
843 mutex_unlock(&shmem_swaplist_mutex); 557 mutex_unlock(&shmem_swaplist_mutex);
844 } 558 }
845 } 559 } else
560 kfree(info->symlink);
846 561
847 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { 562 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
848 kfree(xattr->name); 563 kfree(xattr->name);
@@ -853,106 +568,27 @@ static void shmem_evict_inode(struct inode *inode)
853 end_writeback(inode); 568 end_writeback(inode);
854} 569}
855 570
856static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) 571/*
857{ 572 * If swap found in inode, free it and move page from swapcache to filecache.
858 swp_entry_t *ptr; 573 */
859 574static int shmem_unuse_inode(struct shmem_inode_info *info,
860 for (ptr = dir; ptr < edir; ptr++) { 575 swp_entry_t swap, struct page *page)
861 if (ptr->val == entry.val)
862 return ptr - dir;
863 }
864 return -1;
865}
866
867static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
868{ 576{
869 struct address_space *mapping; 577 struct address_space *mapping = info->vfs_inode.i_mapping;
870 unsigned long idx; 578 void *radswap;
871 unsigned long size; 579 pgoff_t index;
872 unsigned long limit;
873 unsigned long stage;
874 struct page **dir;
875 struct page *subdir;
876 swp_entry_t *ptr;
877 int offset;
878 int error; 580 int error;
879 581
880 idx = 0; 582 radswap = swp_to_radix_entry(swap);
881 ptr = info->i_direct; 583 index = radix_tree_locate_item(&mapping->page_tree, radswap);
882 spin_lock(&info->lock); 584 if (index == -1)
883 if (!info->swapped) { 585 return 0;
884 list_del_init(&info->swaplist);
885 goto lost2;
886 }
887 limit = info->next_index;
888 size = limit;
889 if (size > SHMEM_NR_DIRECT)
890 size = SHMEM_NR_DIRECT;
891 offset = shmem_find_swp(entry, ptr, ptr+size);
892 if (offset >= 0) {
893 shmem_swp_balance_unmap();
894 goto found;
895 }
896 if (!info->i_indirect)
897 goto lost2;
898
899 dir = shmem_dir_map(info->i_indirect);
900 stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
901
902 for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
903 if (unlikely(idx == stage)) {
904 shmem_dir_unmap(dir-1);
905 if (cond_resched_lock(&info->lock)) {
906 /* check it has not been truncated */
907 if (limit > info->next_index) {
908 limit = info->next_index;
909 if (idx >= limit)
910 goto lost2;
911 }
912 }
913 dir = shmem_dir_map(info->i_indirect) +
914 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
915 while (!*dir) {
916 dir++;
917 idx += ENTRIES_PER_PAGEPAGE;
918 if (idx >= limit)
919 goto lost1;
920 }
921 stage = idx + ENTRIES_PER_PAGEPAGE;
922 subdir = *dir;
923 shmem_dir_unmap(dir);
924 dir = shmem_dir_map(subdir);
925 }
926 subdir = *dir;
927 if (subdir && page_private(subdir)) {
928 ptr = shmem_swp_map(subdir);
929 size = limit - idx;
930 if (size > ENTRIES_PER_PAGE)
931 size = ENTRIES_PER_PAGE;
932 offset = shmem_find_swp(entry, ptr, ptr+size);
933 shmem_swp_unmap(ptr);
934 if (offset >= 0) {
935 shmem_dir_unmap(dir);
936 ptr = shmem_swp_map(subdir);
937 goto found;
938 }
939 }
940 }
941lost1:
942 shmem_dir_unmap(dir-1);
943lost2:
944 spin_unlock(&info->lock);
945 return 0;
946found:
947 idx += offset;
948 ptr += offset;
949 586
950 /* 587 /*
951 * Move _head_ to start search for next from here. 588 * Move _head_ to start search for next from here.
952 * But be careful: shmem_evict_inode checks list_empty without taking 589 * But be careful: shmem_evict_inode checks list_empty without taking
953 * mutex, and there's an instant in list_move_tail when info->swaplist 590 * mutex, and there's an instant in list_move_tail when info->swaplist
954 * would appear empty, if it were the only one on shmem_swaplist. We 591 * would appear empty, if it were the only one on shmem_swaplist.
955 * could avoid doing it if inode NULL; or use this minor optimization.
956 */ 592 */
957 if (shmem_swaplist.next != &info->swaplist) 593 if (shmem_swaplist.next != &info->swaplist)
958 list_move_tail(&shmem_swaplist, &info->swaplist); 594 list_move_tail(&shmem_swaplist, &info->swaplist);
@@ -962,42 +598,34 @@ found:
962 * but also to hold up shmem_evict_inode(): so inode cannot be freed 598 * but also to hold up shmem_evict_inode(): so inode cannot be freed
963 * beneath us (pagelock doesn't help until the page is in pagecache). 599 * beneath us (pagelock doesn't help until the page is in pagecache).
964 */ 600 */
965 mapping = info->vfs_inode.i_mapping; 601 error = shmem_add_to_page_cache(page, mapping, index,
966 error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); 602 GFP_NOWAIT, radswap);
967 /* which does mem_cgroup_uncharge_cache_page on error */ 603 /* which does mem_cgroup_uncharge_cache_page on error */
968 604
969 if (error == -EEXIST) { 605 if (error != -ENOMEM) {
970 struct page *filepage = find_get_page(mapping, idx); 606 /*
971 error = 1; 607 * Truncation and eviction use free_swap_and_cache(), which
972 if (filepage) { 608 * only does trylock page: if we raced, best clean up here.
973 /* 609 */
974 * There might be a more uptodate page coming down
975 * from a stacked writepage: forget our swappage if so.
976 */
977 if (PageUptodate(filepage))
978 error = 0;
979 page_cache_release(filepage);
980 }
981 }
982 if (!error) {
983 delete_from_swap_cache(page); 610 delete_from_swap_cache(page);
984 set_page_dirty(page); 611 set_page_dirty(page);
985 info->flags |= SHMEM_PAGEIN; 612 if (!error) {
986 shmem_swp_set(info, ptr, 0); 613 spin_lock(&info->lock);
987 swap_free(entry); 614 info->swapped--;
615 spin_unlock(&info->lock);
616 swap_free(swap);
617 }
988 error = 1; /* not an error, but entry was found */ 618 error = 1; /* not an error, but entry was found */
989 } 619 }
990 shmem_swp_unmap(ptr);
991 spin_unlock(&info->lock);
992 return error; 620 return error;
993} 621}
994 622
995/* 623/*
996 * shmem_unuse() search for an eventually swapped out shmem page. 624 * Search through swapped inodes to find and replace swap by page.
997 */ 625 */
998int shmem_unuse(swp_entry_t entry, struct page *page) 626int shmem_unuse(swp_entry_t swap, struct page *page)
999{ 627{
1000 struct list_head *p, *next; 628 struct list_head *this, *next;
1001 struct shmem_inode_info *info; 629 struct shmem_inode_info *info;
1002 int found = 0; 630 int found = 0;
1003 int error; 631 int error;
@@ -1006,32 +634,25 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
1006 * Charge page using GFP_KERNEL while we can wait, before taking 634 * Charge page using GFP_KERNEL while we can wait, before taking
1007 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 635 * the shmem_swaplist_mutex which might hold up shmem_writepage().
1008 * Charged back to the user (not to caller) when swap account is used. 636 * Charged back to the user (not to caller) when swap account is used.
1009 * add_to_page_cache() will be called with GFP_NOWAIT.
1010 */ 637 */
1011 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 638 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
1012 if (error) 639 if (error)
1013 goto out; 640 goto out;
1014 /* 641 /* No radix_tree_preload: swap entry keeps a place for page in tree */
1015 * Try to preload while we can wait, to not make a habit of
1016 * draining atomic reserves; but don't latch on to this cpu,
1017 * it's okay if sometimes we get rescheduled after this.
1018 */
1019 error = radix_tree_preload(GFP_KERNEL);
1020 if (error)
1021 goto uncharge;
1022 radix_tree_preload_end();
1023 642
1024 mutex_lock(&shmem_swaplist_mutex); 643 mutex_lock(&shmem_swaplist_mutex);
1025 list_for_each_safe(p, next, &shmem_swaplist) { 644 list_for_each_safe(this, next, &shmem_swaplist) {
1026 info = list_entry(p, struct shmem_inode_info, swaplist); 645 info = list_entry(this, struct shmem_inode_info, swaplist);
1027 found = shmem_unuse_inode(info, entry, page); 646 if (info->swapped)
647 found = shmem_unuse_inode(info, swap, page);
648 else
649 list_del_init(&info->swaplist);
1028 cond_resched(); 650 cond_resched();
1029 if (found) 651 if (found)
1030 break; 652 break;
1031 } 653 }
1032 mutex_unlock(&shmem_swaplist_mutex); 654 mutex_unlock(&shmem_swaplist_mutex);
1033 655
1034uncharge:
1035 if (!found) 656 if (!found)
1036 mem_cgroup_uncharge_cache_page(page); 657 mem_cgroup_uncharge_cache_page(page);
1037 if (found < 0) 658 if (found < 0)
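
shmem_unuse_inode(), earlier in this file's diff, no longer walks a private swap vector; it asks the page-cache radix tree for the index whose slot holds the encoded swap entry (radix_tree_locate_item()) and returns 0 if none is found. A hedged userspace sketch of that locate-by-value semantic, with a flat array standing in for the radix tree and an invented swap value:

#include <stdio.h>

/* flat array standing in for the inode's page-cache radix tree */
static long locate_item(void **slots, long nr_slots, void *item)
{
	long index;

	for (index = 0; index < nr_slots; index++)
		if (slots[index] == item)
			return index;
	return -1;
}

int main(void)
{
	void *slots[8] = { 0 };
	void *radswap = (void *)((0x1234UL << 2) | 2);	/* encoded swap entry */

	slots[5] = radswap;	/* pretend the page at index 5 was swapped out */
	printf("swap entry found at file index %ld\n",
	       locate_item(slots, 8, radswap));
	return 0;
}
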
@@ -1048,10 +669,10 @@ out:
1048static int shmem_writepage(struct page *page, struct writeback_control *wbc) 669static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1049{ 670{
1050 struct shmem_inode_info *info; 671 struct shmem_inode_info *info;
1051 swp_entry_t *entry, swap;
1052 struct address_space *mapping; 672 struct address_space *mapping;
1053 unsigned long index;
1054 struct inode *inode; 673 struct inode *inode;
674 swp_entry_t swap;
675 pgoff_t index;
1055 676
1056 BUG_ON(!PageLocked(page)); 677 BUG_ON(!PageLocked(page));
1057 mapping = page->mapping; 678 mapping = page->mapping;
@@ -1066,69 +687,46 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1066 /* 687 /*
1067 * shmem_backing_dev_info's capabilities prevent regular writeback or 688 * shmem_backing_dev_info's capabilities prevent regular writeback or
1068 * sync from ever calling shmem_writepage; but a stacking filesystem 689 * sync from ever calling shmem_writepage; but a stacking filesystem
1069 * may use the ->writepage of its underlying filesystem, in which case 690 * might use ->writepage of its underlying filesystem, in which case
1070 * tmpfs should write out to swap only in response to memory pressure, 691 * tmpfs should write out to swap only in response to memory pressure,
1071 * and not for the writeback threads or sync. However, in those cases, 692 * and not for the writeback threads or sync.
1072 * we do still want to check if there's a redundant swappage to be
1073 * discarded.
1074 */ 693 */
1075 if (wbc->for_reclaim) 694 if (!wbc->for_reclaim) {
1076 swap = get_swap_page(); 695 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
1077 else 696 goto redirty;
1078 swap.val = 0; 697 }
698 swap = get_swap_page();
699 if (!swap.val)
700 goto redirty;
1079 701
1080 /* 702 /*
1081 * Add inode to shmem_unuse()'s list of swapped-out inodes, 703 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1082 * if it's not already there. Do it now because we cannot take 704 * if it's not already there. Do it now before the page is
1083 * mutex while holding spinlock, and must do so before the page 705 * moved to swap cache, when its pagelock no longer protects
1084 * is moved to swap cache, when its pagelock no longer protects
1085 * the inode from eviction. But don't unlock the mutex until 706 * the inode from eviction. But don't unlock the mutex until
1086 * we've taken the spinlock, because shmem_unuse_inode() will 707 * we've incremented swapped, because shmem_unuse_inode() will
1087 * prune a !swapped inode from the swaplist under both locks. 708 * prune a !swapped inode from the swaplist under this mutex.
1088 */ 709 */
1089 if (swap.val) { 710 mutex_lock(&shmem_swaplist_mutex);
1090 mutex_lock(&shmem_swaplist_mutex); 711 if (list_empty(&info->swaplist))
1091 if (list_empty(&info->swaplist)) 712 list_add_tail(&info->swaplist, &shmem_swaplist);
1092 list_add_tail(&info->swaplist, &shmem_swaplist);
1093 }
1094
1095 spin_lock(&info->lock);
1096 if (swap.val)
1097 mutex_unlock(&shmem_swaplist_mutex);
1098
1099 if (index >= info->next_index) {
1100 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
1101 goto unlock;
1102 }
1103 entry = shmem_swp_entry(info, index, NULL);
1104 if (entry->val) {
1105 /*
1106 * The more uptodate page coming down from a stacked
1107 * writepage should replace our old swappage.
1108 */
1109 free_swap_and_cache(*entry);
1110 shmem_swp_set(info, entry, 0);
1111 }
1112 shmem_recalc_inode(inode);
1113 713
1114 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 714 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1115 delete_from_page_cache(page);
1116 shmem_swp_set(info, entry, swap.val);
1117 shmem_swp_unmap(entry);
1118 swap_shmem_alloc(swap); 715 swap_shmem_alloc(swap);
716 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
717
718 spin_lock(&info->lock);
719 info->swapped++;
720 shmem_recalc_inode(inode);
1119 spin_unlock(&info->lock); 721 spin_unlock(&info->lock);
722
723 mutex_unlock(&shmem_swaplist_mutex);
1120 BUG_ON(page_mapped(page)); 724 BUG_ON(page_mapped(page));
1121 swap_writepage(page, wbc); 725 swap_writepage(page, wbc);
1122 return 0; 726 return 0;
1123 } 727 }
1124 728
1125 shmem_swp_unmap(entry); 729 mutex_unlock(&shmem_swaplist_mutex);
1126unlock:
1127 spin_unlock(&info->lock);
1128 /*
1129 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
1130 * clear SWAP_HAS_CACHE flag.
1131 */
1132 swapcache_free(swap, NULL); 730 swapcache_free(swap, NULL);
1133redirty: 731redirty:
1134 set_page_dirty(page); 732 set_page_dirty(page);
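Note on the hunk above: the rewritten shmem_writepage() no longer keeps a per-inode swap vector; the swap entry is stashed directly in the mapping's radix tree as an "exceptional" entry via swp_to_radix_entry(), and recovered later by shmem_getpage_gfp(). Those helpers live in include/linux/swapops.h rather than in this hunk; a minimal sketch of the encoding they are assumed to use (swap entry value shifted above the radix-tree exceptional-entry flag bits) is:

static inline void *swp_to_radix_entry(swp_entry_t entry)
{
	/* shift the swap entry value above the radix-tree flag bits... */
	unsigned long value = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT;

	/* ...and tag it so lookups can tell it apart from a struct page * */
	return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
}

static inline swp_entry_t radix_to_swp_entry(void *arg)
{
	swp_entry_t entry;

	entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
	return entry;
}

With this representation, find_lock_page() can hand back either a locked page or an exceptional entry, which is why shmem_getpage_gfp() below tests radix_tree_exceptional_entry() on the lookup result before deciding whether to swap in.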
@@ -1165,35 +763,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1165} 763}
1166#endif /* CONFIG_TMPFS */ 764#endif /* CONFIG_TMPFS */
1167 765
1168static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 766static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1169 struct shmem_inode_info *info, unsigned long idx) 767 struct shmem_inode_info *info, pgoff_t index)
1170{ 768{
1171 struct mempolicy mpol, *spol; 769 struct mempolicy mpol, *spol;
1172 struct vm_area_struct pvma; 770 struct vm_area_struct pvma;
1173 struct page *page;
1174 771
1175 spol = mpol_cond_copy(&mpol, 772 spol = mpol_cond_copy(&mpol,
1176 mpol_shared_policy_lookup(&info->policy, idx)); 773 mpol_shared_policy_lookup(&info->policy, index));
1177 774
1178 /* Create a pseudo vma that just contains the policy */ 775 /* Create a pseudo vma that just contains the policy */
1179 pvma.vm_start = 0; 776 pvma.vm_start = 0;
1180 pvma.vm_pgoff = idx; 777 pvma.vm_pgoff = index;
1181 pvma.vm_ops = NULL; 778 pvma.vm_ops = NULL;
1182 pvma.vm_policy = spol; 779 pvma.vm_policy = spol;
1183 page = swapin_readahead(entry, gfp, &pvma, 0); 780 return swapin_readahead(swap, gfp, &pvma, 0);
1184 return page;
1185} 781}
1186 782
1187static struct page *shmem_alloc_page(gfp_t gfp, 783static struct page *shmem_alloc_page(gfp_t gfp,
1188 struct shmem_inode_info *info, unsigned long idx) 784 struct shmem_inode_info *info, pgoff_t index)
1189{ 785{
1190 struct vm_area_struct pvma; 786 struct vm_area_struct pvma;
1191 787
1192 /* Create a pseudo vma that just contains the policy */ 788 /* Create a pseudo vma that just contains the policy */
1193 pvma.vm_start = 0; 789 pvma.vm_start = 0;
1194 pvma.vm_pgoff = idx; 790 pvma.vm_pgoff = index;
1195 pvma.vm_ops = NULL; 791 pvma.vm_ops = NULL;
1196 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); 792 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1197 793
1198 /* 794 /*
1199 * alloc_page_vma() will drop the shared policy reference 795 * alloc_page_vma() will drop the shared policy reference
@@ -1202,19 +798,19 @@ static struct page *shmem_alloc_page(gfp_t gfp,
1202} 798}
1203#else /* !CONFIG_NUMA */ 799#else /* !CONFIG_NUMA */
1204#ifdef CONFIG_TMPFS 800#ifdef CONFIG_TMPFS
1205static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) 801static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1206{ 802{
1207} 803}
1208#endif /* CONFIG_TMPFS */ 804#endif /* CONFIG_TMPFS */
1209 805
1210static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 806static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1211 struct shmem_inode_info *info, unsigned long idx) 807 struct shmem_inode_info *info, pgoff_t index)
1212{ 808{
1213 return swapin_readahead(entry, gfp, NULL, 0); 809 return swapin_readahead(swap, gfp, NULL, 0);
1214} 810}
1215 811
1216static inline struct page *shmem_alloc_page(gfp_t gfp, 812static inline struct page *shmem_alloc_page(gfp_t gfp,
1217 struct shmem_inode_info *info, unsigned long idx) 813 struct shmem_inode_info *info, pgoff_t index)
1218{ 814{
1219 return alloc_page(gfp); 815 return alloc_page(gfp);
1220} 816}
@@ -1228,311 +824,195 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1228#endif 824#endif
1229 825
1230/* 826/*
1231 * shmem_getpage - either get the page from swap or allocate a new one 827 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
1232 * 828 *
1233 * If we allocate a new one we do not mark it dirty. That's up to the 829 * If we allocate a new one we do not mark it dirty. That's up to the
1234 * vm. If we swap it in we mark it dirty since we also free the swap 830 * vm. If we swap it in we mark it dirty since we also free the swap
1235 * entry since a page cannot live in both the swap and page cache 831 * entry since a page cannot live in both the swap and page cache
1236 */ 832 */
1237static int shmem_getpage(struct inode *inode, unsigned long idx, 833static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1238 struct page **pagep, enum sgp_type sgp, int *type) 834 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
1239{ 835{
1240 struct address_space *mapping = inode->i_mapping; 836 struct address_space *mapping = inode->i_mapping;
1241 struct shmem_inode_info *info = SHMEM_I(inode); 837 struct shmem_inode_info *info;
1242 struct shmem_sb_info *sbinfo; 838 struct shmem_sb_info *sbinfo;
1243 struct page *filepage = *pagep; 839 struct page *page;
1244 struct page *swappage;
1245 struct page *prealloc_page = NULL;
1246 swp_entry_t *entry;
1247 swp_entry_t swap; 840 swp_entry_t swap;
1248 gfp_t gfp;
1249 int error; 841 int error;
842 int once = 0;
1250 843
1251 if (idx >= SHMEM_MAX_INDEX) 844 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
1252 return -EFBIG; 845 return -EFBIG;
846repeat:
847 swap.val = 0;
848 page = find_lock_page(mapping, index);
849 if (radix_tree_exceptional_entry(page)) {
850 swap = radix_to_swp_entry(page);
851 page = NULL;
852 }
1253 853
1254 if (type) 854 if (sgp != SGP_WRITE &&
1255 *type = 0; 855 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
856 error = -EINVAL;
857 goto failed;
858 }
1256 859
1257 /* 860 if (page || (sgp == SGP_READ && !swap.val)) {
1258 * Normally, filepage is NULL on entry, and either found
1259 * uptodate immediately, or allocated and zeroed, or read
1260 * in under swappage, which is then assigned to filepage.
1261 * But shmem_readpage (required for splice) passes in a locked
1262 * filepage, which may be found not uptodate by other callers
1263 * too, and may need to be copied from the swappage read in.
1264 */
1265repeat:
1266 if (!filepage)
1267 filepage = find_lock_page(mapping, idx);
1268 if (filepage && PageUptodate(filepage))
1269 goto done;
1270 gfp = mapping_gfp_mask(mapping);
1271 if (!filepage) {
1272 /* 861 /*
1273 * Try to preload while we can wait, to not make a habit of 862 * Once we can get the page lock, it must be uptodate:
1274 * draining atomic reserves; but don't latch on to this cpu. 863 * if there were an error in reading back from swap,
864 * the page would not be inserted into the filecache.
1275 */ 865 */
1276 error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 866 BUG_ON(page && !PageUptodate(page));
1277 if (error) 867 *pagep = page;
1278 goto failed; 868 return 0;
1279 radix_tree_preload_end();
1280 if (sgp != SGP_READ && !prealloc_page) {
1281 /* We don't care if this fails */
1282 prealloc_page = shmem_alloc_page(gfp, info, idx);
1283 if (prealloc_page) {
1284 if (mem_cgroup_cache_charge(prealloc_page,
1285 current->mm, GFP_KERNEL)) {
1286 page_cache_release(prealloc_page);
1287 prealloc_page = NULL;
1288 }
1289 }
1290 }
1291 } 869 }
1292 error = 0;
1293 870
1294 spin_lock(&info->lock); 871 /*
1295 shmem_recalc_inode(inode); 872 * Fast cache lookup did not find it:
1296 entry = shmem_swp_alloc(info, idx, sgp); 873 * bring it back from swap or allocate.
1297 if (IS_ERR(entry)) { 874 */
1298 spin_unlock(&info->lock); 875 info = SHMEM_I(inode);
1299 error = PTR_ERR(entry); 876 sbinfo = SHMEM_SB(inode->i_sb);
1300 goto failed;
1301 }
1302 swap = *entry;
1303 877
1304 if (swap.val) { 878 if (swap.val) {
1305 /* Look it up and read it in.. */ 879 /* Look it up and read it in.. */
1306 swappage = lookup_swap_cache(swap); 880 page = lookup_swap_cache(swap);
1307 if (!swappage) { 881 if (!page) {
1308 shmem_swp_unmap(entry);
1309 spin_unlock(&info->lock);
1310 /* here we actually do the io */ 882 /* here we actually do the io */
1311 if (type) 883 if (fault_type)
1312 *type |= VM_FAULT_MAJOR; 884 *fault_type |= VM_FAULT_MAJOR;
1313 swappage = shmem_swapin(swap, gfp, info, idx); 885 page = shmem_swapin(swap, gfp, info, index);
1314 if (!swappage) { 886 if (!page) {
1315 spin_lock(&info->lock); 887 error = -ENOMEM;
1316 entry = shmem_swp_alloc(info, idx, sgp); 888 goto failed;
1317 if (IS_ERR(entry))
1318 error = PTR_ERR(entry);
1319 else {
1320 if (entry->val == swap.val)
1321 error = -ENOMEM;
1322 shmem_swp_unmap(entry);
1323 }
1324 spin_unlock(&info->lock);
1325 if (error)
1326 goto failed;
1327 goto repeat;
1328 } 889 }
1329 wait_on_page_locked(swappage);
1330 page_cache_release(swappage);
1331 goto repeat;
1332 } 890 }
1333 891
1334 /* We have to do this with page locked to prevent races */ 892 /* We have to do this with page locked to prevent races */
1335 if (!trylock_page(swappage)) { 893 lock_page(page);
1336 shmem_swp_unmap(entry); 894 if (!PageUptodate(page)) {
1337 spin_unlock(&info->lock);
1338 wait_on_page_locked(swappage);
1339 page_cache_release(swappage);
1340 goto repeat;
1341 }
1342 if (PageWriteback(swappage)) {
1343 shmem_swp_unmap(entry);
1344 spin_unlock(&info->lock);
1345 wait_on_page_writeback(swappage);
1346 unlock_page(swappage);
1347 page_cache_release(swappage);
1348 goto repeat;
1349 }
1350 if (!PageUptodate(swappage)) {
1351 shmem_swp_unmap(entry);
1352 spin_unlock(&info->lock);
1353 unlock_page(swappage);
1354 page_cache_release(swappage);
1355 error = -EIO; 895 error = -EIO;
1356 goto failed; 896 goto failed;
1357 } 897 }
1358 898 wait_on_page_writeback(page);
1359 if (filepage) { 899
1360 shmem_swp_set(info, entry, 0); 900 /* Someone may have already done it for us */
1361 shmem_swp_unmap(entry); 901 if (page->mapping) {
1362 delete_from_swap_cache(swappage); 902 if (page->mapping == mapping &&
1363 spin_unlock(&info->lock); 903 page->index == index)
1364 copy_highpage(filepage, swappage); 904 goto done;
1365 unlock_page(swappage); 905 error = -EEXIST;
1366 page_cache_release(swappage); 906 goto failed;
1367 flush_dcache_page(filepage);
1368 SetPageUptodate(filepage);
1369 set_page_dirty(filepage);
1370 swap_free(swap);
1371 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1372 idx, GFP_NOWAIT))) {
1373 info->flags |= SHMEM_PAGEIN;
1374 shmem_swp_set(info, entry, 0);
1375 shmem_swp_unmap(entry);
1376 delete_from_swap_cache(swappage);
1377 spin_unlock(&info->lock);
1378 filepage = swappage;
1379 set_page_dirty(filepage);
1380 swap_free(swap);
1381 } else {
1382 shmem_swp_unmap(entry);
1383 spin_unlock(&info->lock);
1384 if (error == -ENOMEM) {
1385 /*
1386 * reclaim from proper memory cgroup and
1387 * call memcg's OOM if needed.
1388 */
1389 error = mem_cgroup_shmem_charge_fallback(
1390 swappage,
1391 current->mm,
1392 gfp);
1393 if (error) {
1394 unlock_page(swappage);
1395 page_cache_release(swappage);
1396 goto failed;
1397 }
1398 }
1399 unlock_page(swappage);
1400 page_cache_release(swappage);
1401 goto repeat;
1402 }
1403 } else if (sgp == SGP_READ && !filepage) {
1404 shmem_swp_unmap(entry);
1405 filepage = find_get_page(mapping, idx);
1406 if (filepage &&
1407 (!PageUptodate(filepage) || !trylock_page(filepage))) {
1408 spin_unlock(&info->lock);
1409 wait_on_page_locked(filepage);
1410 page_cache_release(filepage);
1411 filepage = NULL;
1412 goto repeat;
1413 } 907 }
908
909 error = mem_cgroup_cache_charge(page, current->mm,
910 gfp & GFP_RECLAIM_MASK);
911 if (!error)
912 error = shmem_add_to_page_cache(page, mapping, index,
913 gfp, swp_to_radix_entry(swap));
914 if (error)
915 goto failed;
916
917 spin_lock(&info->lock);
918 info->swapped--;
919 shmem_recalc_inode(inode);
1414 spin_unlock(&info->lock); 920 spin_unlock(&info->lock);
921
922 delete_from_swap_cache(page);
923 set_page_dirty(page);
924 swap_free(swap);
925
1415 } else { 926 } else {
1416 shmem_swp_unmap(entry); 927 if (shmem_acct_block(info->flags)) {
1417 sbinfo = SHMEM_SB(inode->i_sb); 928 error = -ENOSPC;
929 goto failed;
930 }
1418 if (sbinfo->max_blocks) { 931 if (sbinfo->max_blocks) {
1419 if (percpu_counter_compare(&sbinfo->used_blocks, 932 if (percpu_counter_compare(&sbinfo->used_blocks,
1420 sbinfo->max_blocks) >= 0 || 933 sbinfo->max_blocks) >= 0) {
1421 shmem_acct_block(info->flags)) 934 error = -ENOSPC;
1422 goto nospace; 935 goto unacct;
1423 percpu_counter_inc(&sbinfo->used_blocks);
1424 spin_lock(&inode->i_lock);
1425 inode->i_blocks += BLOCKS_PER_PAGE;
1426 spin_unlock(&inode->i_lock);
1427 } else if (shmem_acct_block(info->flags))
1428 goto nospace;
1429
1430 if (!filepage) {
1431 int ret;
1432
1433 if (!prealloc_page) {
1434 spin_unlock(&info->lock);
1435 filepage = shmem_alloc_page(gfp, info, idx);
1436 if (!filepage) {
1437 shmem_unacct_blocks(info->flags, 1);
1438 shmem_free_blocks(inode, 1);
1439 error = -ENOMEM;
1440 goto failed;
1441 }
1442 SetPageSwapBacked(filepage);
1443
1444 /*
1445 * Precharge page while we can wait, compensate
1446 * after
1447 */
1448 error = mem_cgroup_cache_charge(filepage,
1449 current->mm, GFP_KERNEL);
1450 if (error) {
1451 page_cache_release(filepage);
1452 shmem_unacct_blocks(info->flags, 1);
1453 shmem_free_blocks(inode, 1);
1454 filepage = NULL;
1455 goto failed;
1456 }
1457
1458 spin_lock(&info->lock);
1459 } else {
1460 filepage = prealloc_page;
1461 prealloc_page = NULL;
1462 SetPageSwapBacked(filepage);
1463 } 936 }
937 percpu_counter_inc(&sbinfo->used_blocks);
938 }
1464 939
1465 entry = shmem_swp_alloc(info, idx, sgp); 940 page = shmem_alloc_page(gfp, info, index);
1466 if (IS_ERR(entry)) 941 if (!page) {
1467 error = PTR_ERR(entry); 942 error = -ENOMEM;
1468 else { 943 goto decused;
1469 swap = *entry;
1470 shmem_swp_unmap(entry);
1471 }
1472 ret = error || swap.val;
1473 if (ret)
1474 mem_cgroup_uncharge_cache_page(filepage);
1475 else
1476 ret = add_to_page_cache_lru(filepage, mapping,
1477 idx, GFP_NOWAIT);
1478 /*
1479 * At add_to_page_cache_lru() failure, uncharge will
1480 * be done automatically.
1481 */
1482 if (ret) {
1483 spin_unlock(&info->lock);
1484 page_cache_release(filepage);
1485 shmem_unacct_blocks(info->flags, 1);
1486 shmem_free_blocks(inode, 1);
1487 filepage = NULL;
1488 if (error)
1489 goto failed;
1490 goto repeat;
1491 }
1492 info->flags |= SHMEM_PAGEIN;
1493 } 944 }
1494 945
946 SetPageSwapBacked(page);
947 __set_page_locked(page);
948 error = mem_cgroup_cache_charge(page, current->mm,
949 gfp & GFP_RECLAIM_MASK);
950 if (!error)
951 error = shmem_add_to_page_cache(page, mapping, index,
952 gfp, NULL);
953 if (error)
954 goto decused;
955 lru_cache_add_anon(page);
956
957 spin_lock(&info->lock);
1495 info->alloced++; 958 info->alloced++;
959 inode->i_blocks += BLOCKS_PER_PAGE;
960 shmem_recalc_inode(inode);
1496 spin_unlock(&info->lock); 961 spin_unlock(&info->lock);
1497 clear_highpage(filepage); 962
1498 flush_dcache_page(filepage); 963 clear_highpage(page);
1499 SetPageUptodate(filepage); 964 flush_dcache_page(page);
965 SetPageUptodate(page);
1500 if (sgp == SGP_DIRTY) 966 if (sgp == SGP_DIRTY)
1501 set_page_dirty(filepage); 967 set_page_dirty(page);
1502 } 968 }
1503done: 969done:
1504 *pagep = filepage; 970 /* Perhaps the file has been truncated since we checked */
1505 error = 0; 971 if (sgp != SGP_WRITE &&
1506 goto out; 972 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
973 error = -EINVAL;
974 goto trunc;
975 }
976 *pagep = page;
977 return 0;
1507 978
1508nospace:
1509 /* 979 /*
1510 * Perhaps the page was brought in from swap between find_lock_page 980 * Error recovery.
1511 * and taking info->lock? We allow for that at add_to_page_cache_lru,
1512 * but must also avoid reporting a spurious ENOSPC while working on a
1513 * full tmpfs. (When filepage has been passed in to shmem_getpage, it
1514 * is already in page cache, which prevents this race from occurring.)
1515 */ 981 */
1516 if (!filepage) { 982trunc:
1517 struct page *page = find_get_page(mapping, idx); 983 ClearPageDirty(page);
1518 if (page) { 984 delete_from_page_cache(page);
1519 spin_unlock(&info->lock); 985 spin_lock(&info->lock);
1520 page_cache_release(page); 986 info->alloced--;
1521 goto repeat; 987 inode->i_blocks -= BLOCKS_PER_PAGE;
1522 }
1523 }
1524 spin_unlock(&info->lock); 988 spin_unlock(&info->lock);
1525 error = -ENOSPC; 989decused:
990 if (sbinfo->max_blocks)
991 percpu_counter_add(&sbinfo->used_blocks, -1);
992unacct:
993 shmem_unacct_blocks(info->flags, 1);
1526failed: 994failed:
1527 if (*pagep != filepage) { 995 if (swap.val && error != -EINVAL) {
1528 unlock_page(filepage); 996 struct page *test = find_get_page(mapping, index);
1529 page_cache_release(filepage); 997 if (test && !radix_tree_exceptional_entry(test))
998 page_cache_release(test);
999 /* Have another try if the entry has changed */
1000 if (test != swp_to_radix_entry(swap))
1001 error = -EEXIST;
1530 } 1002 }
1531out: 1003 if (page) {
1532 if (prealloc_page) { 1004 unlock_page(page);
1533 mem_cgroup_uncharge_cache_page(prealloc_page); 1005 page_cache_release(page);
1534 page_cache_release(prealloc_page);
1535 } 1006 }
1007 if (error == -ENOSPC && !once++) {
1008 info = SHMEM_I(inode);
1009 spin_lock(&info->lock);
1010 shmem_recalc_inode(inode);
1011 spin_unlock(&info->lock);
1012 goto repeat;
1013 }
1014 if (error == -EEXIST)
1015 goto repeat;
1536 return error; 1016 return error;
1537} 1017}
1538 1018
@@ -1540,36 +1020,34 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1540{ 1020{
1541 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1021 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1542 int error; 1022 int error;
1543 int ret; 1023 int ret = VM_FAULT_LOCKED;
1544
1545 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1546 return VM_FAULT_SIGBUS;
1547 1024
1548 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1025 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1549 if (error) 1026 if (error)
1550 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1027 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1028
1551 if (ret & VM_FAULT_MAJOR) { 1029 if (ret & VM_FAULT_MAJOR) {
1552 count_vm_event(PGMAJFAULT); 1030 count_vm_event(PGMAJFAULT);
1553 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1031 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1554 } 1032 }
1555 return ret | VM_FAULT_LOCKED; 1033 return ret;
1556} 1034}
1557 1035
1558#ifdef CONFIG_NUMA 1036#ifdef CONFIG_NUMA
1559static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 1037static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
1560{ 1038{
1561 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1039 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1562 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); 1040 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
1563} 1041}
1564 1042
1565static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 1043static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1566 unsigned long addr) 1044 unsigned long addr)
1567{ 1045{
1568 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1046 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1569 unsigned long idx; 1047 pgoff_t index;
1570 1048
1571 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1049 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1572 return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); 1050 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
1573} 1051}
1574#endif 1052#endif
1575 1053
@@ -1667,20 +1145,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1667 1145
1668#ifdef CONFIG_TMPFS 1146#ifdef CONFIG_TMPFS
1669static const struct inode_operations shmem_symlink_inode_operations; 1147static const struct inode_operations shmem_symlink_inode_operations;
1670static const struct inode_operations shmem_symlink_inline_operations; 1148static const struct inode_operations shmem_short_symlink_operations;
1671
1672/*
1673 * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
1674 * but providing them allows a tmpfs file to be used for splice, sendfile, and
1675 * below the loop driver, in the generic fashion that many filesystems support.
1676 */
1677static int shmem_readpage(struct file *file, struct page *page)
1678{
1679 struct inode *inode = page->mapping->host;
1680 int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
1681 unlock_page(page);
1682 return error;
1683}
1684 1149
1685static int 1150static int
1686shmem_write_begin(struct file *file, struct address_space *mapping, 1151shmem_write_begin(struct file *file, struct address_space *mapping,
@@ -1689,7 +1154,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
1689{ 1154{
1690 struct inode *inode = mapping->host; 1155 struct inode *inode = mapping->host;
1691 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1156 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1692 *pagep = NULL;
1693 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); 1157 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1694} 1158}
1695 1159
@@ -1714,7 +1178,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1714{ 1178{
1715 struct inode *inode = filp->f_path.dentry->d_inode; 1179 struct inode *inode = filp->f_path.dentry->d_inode;
1716 struct address_space *mapping = inode->i_mapping; 1180 struct address_space *mapping = inode->i_mapping;
1717 unsigned long index, offset; 1181 pgoff_t index;
1182 unsigned long offset;
1718 enum sgp_type sgp = SGP_READ; 1183 enum sgp_type sgp = SGP_READ;
1719 1184
1720 /* 1185 /*
@@ -1730,7 +1195,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1730 1195
1731 for (;;) { 1196 for (;;) {
1732 struct page *page = NULL; 1197 struct page *page = NULL;
1733 unsigned long end_index, nr, ret; 1198 pgoff_t end_index;
1199 unsigned long nr, ret;
1734 loff_t i_size = i_size_read(inode); 1200 loff_t i_size = i_size_read(inode);
1735 1201
1736 end_index = i_size >> PAGE_CACHE_SHIFT; 1202 end_index = i_size >> PAGE_CACHE_SHIFT;
@@ -1846,6 +1312,119 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1846 return retval; 1312 return retval;
1847} 1313}
1848 1314
1315static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1316 struct pipe_inode_info *pipe, size_t len,
1317 unsigned int flags)
1318{
1319 struct address_space *mapping = in->f_mapping;
1320 struct inode *inode = mapping->host;
1321 unsigned int loff, nr_pages, req_pages;
1322 struct page *pages[PIPE_DEF_BUFFERS];
1323 struct partial_page partial[PIPE_DEF_BUFFERS];
1324 struct page *page;
1325 pgoff_t index, end_index;
1326 loff_t isize, left;
1327 int error, page_nr;
1328 struct splice_pipe_desc spd = {
1329 .pages = pages,
1330 .partial = partial,
1331 .flags = flags,
1332 .ops = &page_cache_pipe_buf_ops,
1333 .spd_release = spd_release_page,
1334 };
1335
1336 isize = i_size_read(inode);
1337 if (unlikely(*ppos >= isize))
1338 return 0;
1339
1340 left = isize - *ppos;
1341 if (unlikely(left < len))
1342 len = left;
1343
1344 if (splice_grow_spd(pipe, &spd))
1345 return -ENOMEM;
1346
1347 index = *ppos >> PAGE_CACHE_SHIFT;
1348 loff = *ppos & ~PAGE_CACHE_MASK;
1349 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1350 nr_pages = min(req_pages, pipe->buffers);
1351
1352 spd.nr_pages = find_get_pages_contig(mapping, index,
1353 nr_pages, spd.pages);
1354 index += spd.nr_pages;
1355 error = 0;
1356
1357 while (spd.nr_pages < nr_pages) {
1358 error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
1359 if (error)
1360 break;
1361 unlock_page(page);
1362 spd.pages[spd.nr_pages++] = page;
1363 index++;
1364 }
1365
1366 index = *ppos >> PAGE_CACHE_SHIFT;
1367 nr_pages = spd.nr_pages;
1368 spd.nr_pages = 0;
1369
1370 for (page_nr = 0; page_nr < nr_pages; page_nr++) {
1371 unsigned int this_len;
1372
1373 if (!len)
1374 break;
1375
1376 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
1377 page = spd.pages[page_nr];
1378
1379 if (!PageUptodate(page) || page->mapping != mapping) {
1380 error = shmem_getpage(inode, index, &page,
1381 SGP_CACHE, NULL);
1382 if (error)
1383 break;
1384 unlock_page(page);
1385 page_cache_release(spd.pages[page_nr]);
1386 spd.pages[page_nr] = page;
1387 }
1388
1389 isize = i_size_read(inode);
1390 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1391 if (unlikely(!isize || index > end_index))
1392 break;
1393
1394 if (end_index == index) {
1395 unsigned int plen;
1396
1397 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1398 if (plen <= loff)
1399 break;
1400
1401 this_len = min(this_len, plen - loff);
1402 len = this_len;
1403 }
1404
1405 spd.partial[page_nr].offset = loff;
1406 spd.partial[page_nr].len = this_len;
1407 len -= this_len;
1408 loff = 0;
1409 spd.nr_pages++;
1410 index++;
1411 }
1412
1413 while (page_nr < nr_pages)
1414 page_cache_release(spd.pages[page_nr++]);
1415
1416 if (spd.nr_pages)
1417 error = splice_to_pipe(pipe, &spd);
1418
1419 splice_shrink_spd(pipe, &spd);
1420
1421 if (error > 0) {
1422 *ppos += error;
1423 file_accessed(in);
1424 }
1425 return error;
1426}
1427
1849static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1428static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1850{ 1429{
1851 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1430 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -1855,8 +1434,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1855 buf->f_namelen = NAME_MAX; 1434 buf->f_namelen = NAME_MAX;
1856 if (sbinfo->max_blocks) { 1435 if (sbinfo->max_blocks) {
1857 buf->f_blocks = sbinfo->max_blocks; 1436 buf->f_blocks = sbinfo->max_blocks;
1858 buf->f_bavail = buf->f_bfree = 1437 buf->f_bavail =
1859 sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); 1438 buf->f_bfree = sbinfo->max_blocks -
1439 percpu_counter_sum(&sbinfo->used_blocks);
1860 } 1440 }
1861 if (sbinfo->max_inodes) { 1441 if (sbinfo->max_inodes) {
1862 buf->f_files = sbinfo->max_inodes; 1442 buf->f_files = sbinfo->max_inodes;
@@ -2006,7 +1586,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2006 int error; 1586 int error;
2007 int len; 1587 int len;
2008 struct inode *inode; 1588 struct inode *inode;
2009 struct page *page = NULL; 1589 struct page *page;
2010 char *kaddr; 1590 char *kaddr;
2011 struct shmem_inode_info *info; 1591 struct shmem_inode_info *info;
2012 1592
@@ -2030,10 +1610,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2030 1610
2031 info = SHMEM_I(inode); 1611 info = SHMEM_I(inode);
2032 inode->i_size = len-1; 1612 inode->i_size = len-1;
2033 if (len <= SHMEM_SYMLINK_INLINE_LEN) { 1613 if (len <= SHORT_SYMLINK_LEN) {
2034 /* do it inline */ 1614 info->symlink = kmemdup(symname, len, GFP_KERNEL);
2035 memcpy(info->inline_symlink, symname, len); 1615 if (!info->symlink) {
2036 inode->i_op = &shmem_symlink_inline_operations; 1616 iput(inode);
1617 return -ENOMEM;
1618 }
1619 inode->i_op = &shmem_short_symlink_operations;
2037 } else { 1620 } else {
2038 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 1621 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
2039 if (error) { 1622 if (error) {
@@ -2056,17 +1639,17 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2056 return 0; 1639 return 0;
2057} 1640}
2058 1641
2059static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) 1642static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
2060{ 1643{
2061 nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); 1644 nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
2062 return NULL; 1645 return NULL;
2063} 1646}
2064 1647
2065static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 1648static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
2066{ 1649{
2067 struct page *page = NULL; 1650 struct page *page = NULL;
2068 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 1651 int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
2069 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); 1652 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
2070 if (page) 1653 if (page)
2071 unlock_page(page); 1654 unlock_page(page);
2072 return page; 1655 return page;
@@ -2177,7 +1760,6 @@ out:
2177 return err; 1760 return err;
2178} 1761}
2179 1762
2180
2181static const struct xattr_handler *shmem_xattr_handlers[] = { 1763static const struct xattr_handler *shmem_xattr_handlers[] = {
2182#ifdef CONFIG_TMPFS_POSIX_ACL 1764#ifdef CONFIG_TMPFS_POSIX_ACL
2183 &generic_acl_access_handler, 1765 &generic_acl_access_handler,
@@ -2307,9 +1889,9 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2307} 1889}
2308#endif /* CONFIG_TMPFS_XATTR */ 1890#endif /* CONFIG_TMPFS_XATTR */
2309 1891
2310static const struct inode_operations shmem_symlink_inline_operations = { 1892static const struct inode_operations shmem_short_symlink_operations = {
2311 .readlink = generic_readlink, 1893 .readlink = generic_readlink,
2312 .follow_link = shmem_follow_link_inline, 1894 .follow_link = shmem_follow_short_symlink,
2313#ifdef CONFIG_TMPFS_XATTR 1895#ifdef CONFIG_TMPFS_XATTR
2314 .setxattr = shmem_setxattr, 1896 .setxattr = shmem_setxattr,
2315 .getxattr = shmem_getxattr, 1897 .getxattr = shmem_getxattr,
@@ -2509,8 +2091,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2509 if (config.max_inodes < inodes) 2091 if (config.max_inodes < inodes)
2510 goto out; 2092 goto out;
2511 /* 2093 /*
2512 * Those tests also disallow limited->unlimited while any are in 2094 * Those tests disallow limited->unlimited while any are in use;
2513 * use, so i_blocks will always be zero when max_blocks is zero;
2514 * but we must separately disallow unlimited->limited, because 2095 * but we must separately disallow unlimited->limited, because
2515 * in that case we have no record of how much is already in use. 2096 * in that case we have no record of how much is already in use.
2516 */ 2097 */
@@ -2602,7 +2183,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2602 goto failed; 2183 goto failed;
2603 sbinfo->free_inodes = sbinfo->max_inodes; 2184 sbinfo->free_inodes = sbinfo->max_inodes;
2604 2185
2605 sb->s_maxbytes = SHMEM_MAX_BYTES; 2186 sb->s_maxbytes = MAX_LFS_FILESIZE;
2606 sb->s_blocksize = PAGE_CACHE_SIZE; 2187 sb->s_blocksize = PAGE_CACHE_SIZE;
2607 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 2188 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2608 sb->s_magic = TMPFS_MAGIC; 2189 sb->s_magic = TMPFS_MAGIC;
@@ -2637,14 +2218,14 @@ static struct kmem_cache *shmem_inode_cachep;
2637 2218
2638static struct inode *shmem_alloc_inode(struct super_block *sb) 2219static struct inode *shmem_alloc_inode(struct super_block *sb)
2639{ 2220{
2640 struct shmem_inode_info *p; 2221 struct shmem_inode_info *info;
2641 p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 2222 info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2642 if (!p) 2223 if (!info)
2643 return NULL; 2224 return NULL;
2644 return &p->vfs_inode; 2225 return &info->vfs_inode;
2645} 2226}
2646 2227
2647static void shmem_i_callback(struct rcu_head *head) 2228static void shmem_destroy_callback(struct rcu_head *head)
2648{ 2229{
2649 struct inode *inode = container_of(head, struct inode, i_rcu); 2230 struct inode *inode = container_of(head, struct inode, i_rcu);
2650 INIT_LIST_HEAD(&inode->i_dentry); 2231 INIT_LIST_HEAD(&inode->i_dentry);
@@ -2653,29 +2234,26 @@ static void shmem_i_callback(struct rcu_head *head)
2653 2234
2654static void shmem_destroy_inode(struct inode *inode) 2235static void shmem_destroy_inode(struct inode *inode)
2655{ 2236{
2656 if ((inode->i_mode & S_IFMT) == S_IFREG) { 2237 if ((inode->i_mode & S_IFMT) == S_IFREG)
2657 /* only struct inode is valid if it's an inline symlink */
2658 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2238 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2659 } 2239 call_rcu(&inode->i_rcu, shmem_destroy_callback);
2660 call_rcu(&inode->i_rcu, shmem_i_callback);
2661} 2240}
2662 2241
2663static void init_once(void *foo) 2242static void shmem_init_inode(void *foo)
2664{ 2243{
2665 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2244 struct shmem_inode_info *info = foo;
2666 2245 inode_init_once(&info->vfs_inode);
2667 inode_init_once(&p->vfs_inode);
2668} 2246}
2669 2247
2670static int init_inodecache(void) 2248static int shmem_init_inodecache(void)
2671{ 2249{
2672 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 2250 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2673 sizeof(struct shmem_inode_info), 2251 sizeof(struct shmem_inode_info),
2674 0, SLAB_PANIC, init_once); 2252 0, SLAB_PANIC, shmem_init_inode);
2675 return 0; 2253 return 0;
2676} 2254}
2677 2255
2678static void destroy_inodecache(void) 2256static void shmem_destroy_inodecache(void)
2679{ 2257{
2680 kmem_cache_destroy(shmem_inode_cachep); 2258 kmem_cache_destroy(shmem_inode_cachep);
2681} 2259}
@@ -2684,7 +2262,6 @@ static const struct address_space_operations shmem_aops = {
2684 .writepage = shmem_writepage, 2262 .writepage = shmem_writepage,
2685 .set_page_dirty = __set_page_dirty_no_writeback, 2263 .set_page_dirty = __set_page_dirty_no_writeback,
2686#ifdef CONFIG_TMPFS 2264#ifdef CONFIG_TMPFS
2687 .readpage = shmem_readpage,
2688 .write_begin = shmem_write_begin, 2265 .write_begin = shmem_write_begin,
2689 .write_end = shmem_write_end, 2266 .write_end = shmem_write_end,
2690#endif 2267#endif
@@ -2701,7 +2278,7 @@ static const struct file_operations shmem_file_operations = {
2701 .aio_read = shmem_file_aio_read, 2278 .aio_read = shmem_file_aio_read,
2702 .aio_write = generic_file_aio_write, 2279 .aio_write = generic_file_aio_write,
2703 .fsync = noop_fsync, 2280 .fsync = noop_fsync,
2704 .splice_read = generic_file_splice_read, 2281 .splice_read = shmem_file_splice_read,
2705 .splice_write = generic_file_splice_write, 2282 .splice_write = generic_file_splice_write,
2706#endif 2283#endif
2707}; 2284};
@@ -2715,10 +2292,6 @@ static const struct inode_operations shmem_inode_operations = {
2715 .listxattr = shmem_listxattr, 2292 .listxattr = shmem_listxattr,
2716 .removexattr = shmem_removexattr, 2293 .removexattr = shmem_removexattr,
2717#endif 2294#endif
2718#ifdef CONFIG_TMPFS_POSIX_ACL
2719 .check_acl = generic_check_acl,
2720#endif
2721
2722}; 2295};
2723 2296
2724static const struct inode_operations shmem_dir_inode_operations = { 2297static const struct inode_operations shmem_dir_inode_operations = {
@@ -2741,7 +2314,6 @@ static const struct inode_operations shmem_dir_inode_operations = {
2741#endif 2314#endif
2742#ifdef CONFIG_TMPFS_POSIX_ACL 2315#ifdef CONFIG_TMPFS_POSIX_ACL
2743 .setattr = shmem_setattr, 2316 .setattr = shmem_setattr,
2744 .check_acl = generic_check_acl,
2745#endif 2317#endif
2746}; 2318};
2747 2319
@@ -2754,7 +2326,6 @@ static const struct inode_operations shmem_special_inode_operations = {
2754#endif 2326#endif
2755#ifdef CONFIG_TMPFS_POSIX_ACL 2327#ifdef CONFIG_TMPFS_POSIX_ACL
2756 .setattr = shmem_setattr, 2328 .setattr = shmem_setattr,
2757 .check_acl = generic_check_acl,
2758#endif 2329#endif
2759}; 2330};
2760 2331
@@ -2779,21 +2350,20 @@ static const struct vm_operations_struct shmem_vm_ops = {
2779#endif 2350#endif
2780}; 2351};
2781 2352
2782
2783static struct dentry *shmem_mount(struct file_system_type *fs_type, 2353static struct dentry *shmem_mount(struct file_system_type *fs_type,
2784 int flags, const char *dev_name, void *data) 2354 int flags, const char *dev_name, void *data)
2785{ 2355{
2786 return mount_nodev(fs_type, flags, data, shmem_fill_super); 2356 return mount_nodev(fs_type, flags, data, shmem_fill_super);
2787} 2357}
2788 2358
2789static struct file_system_type tmpfs_fs_type = { 2359static struct file_system_type shmem_fs_type = {
2790 .owner = THIS_MODULE, 2360 .owner = THIS_MODULE,
2791 .name = "tmpfs", 2361 .name = "tmpfs",
2792 .mount = shmem_mount, 2362 .mount = shmem_mount,
2793 .kill_sb = kill_litter_super, 2363 .kill_sb = kill_litter_super,
2794}; 2364};
2795 2365
2796int __init init_tmpfs(void) 2366int __init shmem_init(void)
2797{ 2367{
2798 int error; 2368 int error;
2799 2369
@@ -2801,18 +2371,18 @@ int __init init_tmpfs(void)
2801 if (error) 2371 if (error)
2802 goto out4; 2372 goto out4;
2803 2373
2804 error = init_inodecache(); 2374 error = shmem_init_inodecache();
2805 if (error) 2375 if (error)
2806 goto out3; 2376 goto out3;
2807 2377
2808 error = register_filesystem(&tmpfs_fs_type); 2378 error = register_filesystem(&shmem_fs_type);
2809 if (error) { 2379 if (error) {
2810 printk(KERN_ERR "Could not register tmpfs\n"); 2380 printk(KERN_ERR "Could not register tmpfs\n");
2811 goto out2; 2381 goto out2;
2812 } 2382 }
2813 2383
2814 shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, 2384 shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER,
2815 tmpfs_fs_type.name, NULL); 2385 shmem_fs_type.name, NULL);
2816 if (IS_ERR(shm_mnt)) { 2386 if (IS_ERR(shm_mnt)) {
2817 error = PTR_ERR(shm_mnt); 2387 error = PTR_ERR(shm_mnt);
2818 printk(KERN_ERR "Could not kern_mount tmpfs\n"); 2388 printk(KERN_ERR "Could not kern_mount tmpfs\n");
@@ -2821,9 +2391,9 @@ int __init init_tmpfs(void)
2821 return 0; 2391 return 0;
2822 2392
2823out1: 2393out1:
2824 unregister_filesystem(&tmpfs_fs_type); 2394 unregister_filesystem(&shmem_fs_type);
2825out2: 2395out2:
2826 destroy_inodecache(); 2396 shmem_destroy_inodecache();
2827out3: 2397out3:
2828 bdi_destroy(&shmem_backing_dev_info); 2398 bdi_destroy(&shmem_backing_dev_info);
2829out4: 2399out4:
@@ -2831,45 +2401,6 @@ out4:
2831 return error; 2401 return error;
2832} 2402}
2833 2403
2834#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2835/**
2836 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2837 * @inode: the inode to be searched
2838 * @pgoff: the offset to be searched
2839 * @pagep: the pointer for the found page to be stored
2840 * @ent: the pointer for the found swap entry to be stored
2841 *
2842 * If a page is found, refcount of it is incremented. Callers should handle
2843 * these refcount.
2844 */
2845void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2846 struct page **pagep, swp_entry_t *ent)
2847{
2848 swp_entry_t entry = { .val = 0 }, *ptr;
2849 struct page *page = NULL;
2850 struct shmem_inode_info *info = SHMEM_I(inode);
2851
2852 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2853 goto out;
2854
2855 spin_lock(&info->lock);
2856 ptr = shmem_swp_entry(info, pgoff, NULL);
2857#ifdef CONFIG_SWAP
2858 if (ptr && ptr->val) {
2859 entry.val = ptr->val;
2860 page = find_get_page(&swapper_space, entry.val);
2861 } else
2862#endif
2863 page = find_get_page(inode->i_mapping, pgoff);
2864 if (ptr)
2865 shmem_swp_unmap(ptr);
2866 spin_unlock(&info->lock);
2867out:
2868 *pagep = page;
2869 *ent = entry;
2870}
2871#endif
2872
2873#else /* !CONFIG_SHMEM */ 2404#else /* !CONFIG_SHMEM */
2874 2405
2875/* 2406/*
@@ -2883,23 +2414,23 @@ out:
2883 2414
2884#include <linux/ramfs.h> 2415#include <linux/ramfs.h>
2885 2416
2886static struct file_system_type tmpfs_fs_type = { 2417static struct file_system_type shmem_fs_type = {
2887 .name = "tmpfs", 2418 .name = "tmpfs",
2888 .mount = ramfs_mount, 2419 .mount = ramfs_mount,
2889 .kill_sb = kill_litter_super, 2420 .kill_sb = kill_litter_super,
2890}; 2421};
2891 2422
2892int __init init_tmpfs(void) 2423int __init shmem_init(void)
2893{ 2424{
2894 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); 2425 BUG_ON(register_filesystem(&shmem_fs_type) != 0);
2895 2426
2896 shm_mnt = kern_mount(&tmpfs_fs_type); 2427 shm_mnt = kern_mount(&shmem_fs_type);
2897 BUG_ON(IS_ERR(shm_mnt)); 2428 BUG_ON(IS_ERR(shm_mnt));
2898 2429
2899 return 0; 2430 return 0;
2900} 2431}
2901 2432
2902int shmem_unuse(swp_entry_t entry, struct page *page) 2433int shmem_unuse(swp_entry_t swap, struct page *page)
2903{ 2434{
2904 return 0; 2435 return 0;
2905} 2436}
@@ -2909,43 +2440,17 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
2909 return 0; 2440 return 0;
2910} 2441}
2911 2442
2912void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 2443void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
2913{ 2444{
2914 truncate_inode_pages_range(inode->i_mapping, start, end); 2445 truncate_inode_pages_range(inode->i_mapping, lstart, lend);
2915} 2446}
2916EXPORT_SYMBOL_GPL(shmem_truncate_range); 2447EXPORT_SYMBOL_GPL(shmem_truncate_range);
2917 2448
2918#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2919/**
2920 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2921 * @inode: the inode to be searched
2922 * @pgoff: the offset to be searched
2923 * @pagep: the pointer for the found page to be stored
2924 * @ent: the pointer for the found swap entry to be stored
2925 *
2926 * If a page is found, refcount of it is incremented. Callers should handle
2927 * these refcount.
2928 */
2929void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2930 struct page **pagep, swp_entry_t *ent)
2931{
2932 struct page *page = NULL;
2933
2934 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2935 goto out;
2936 page = find_get_page(inode->i_mapping, pgoff);
2937out:
2938 *pagep = page;
2939 *ent = (swp_entry_t){ .val = 0 };
2940}
2941#endif
2942
2943#define shmem_vm_ops generic_file_vm_ops 2449#define shmem_vm_ops generic_file_vm_ops
2944#define shmem_file_operations ramfs_file_operations 2450#define shmem_file_operations ramfs_file_operations
2945#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) 2451#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
2946#define shmem_acct_size(flags, size) 0 2452#define shmem_acct_size(flags, size) 0
2947#define shmem_unacct_size(flags, size) do {} while (0) 2453#define shmem_unacct_size(flags, size) do {} while (0)
2948#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE
2949 2454
2950#endif /* CONFIG_SHMEM */ 2455#endif /* CONFIG_SHMEM */
2951 2456
@@ -2969,7 +2474,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2969 if (IS_ERR(shm_mnt)) 2474 if (IS_ERR(shm_mnt))
2970 return (void *)shm_mnt; 2475 return (void *)shm_mnt;
2971 2476
2972 if (size < 0 || size > SHMEM_MAX_BYTES) 2477 if (size < 0 || size > MAX_LFS_FILESIZE)
2973 return ERR_PTR(-EINVAL); 2478 return ERR_PTR(-EINVAL);
2974 2479
2975 if (shmem_acct_size(flags, size)) 2480 if (shmem_acct_size(flags, size))
@@ -3015,6 +2520,15 @@ put_memory:
3015} 2520}
3016EXPORT_SYMBOL_GPL(shmem_file_setup); 2521EXPORT_SYMBOL_GPL(shmem_file_setup);
3017 2522
2523void shmem_set_file(struct vm_area_struct *vma, struct file *file)
2524{
2525 if (vma->vm_file)
2526 fput(vma->vm_file);
2527 vma->vm_file = file;
2528 vma->vm_ops = &shmem_vm_ops;
2529 vma->vm_flags |= VM_CAN_NONLINEAR;
2530}
2531
3018/** 2532/**
3019 * shmem_zero_setup - setup a shared anonymous mapping 2533 * shmem_zero_setup - setup a shared anonymous mapping
3020 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff 2534 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
@@ -3028,11 +2542,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
3028 if (IS_ERR(file)) 2542 if (IS_ERR(file))
3029 return PTR_ERR(file); 2543 return PTR_ERR(file);
3030 2544
3031 if (vma->vm_file) 2545 shmem_set_file(vma, file);
3032 fput(vma->vm_file);
3033 vma->vm_file = file;
3034 vma->vm_ops = &shmem_vm_ops;
3035 vma->vm_flags |= VM_CAN_NONLINEAR;
3036 return 0; 2546 return 0;
3037} 2547}
3038 2548
@@ -3048,13 +2558,29 @@ int shmem_zero_setup(struct vm_area_struct *vma)
3048 * suit tmpfs, since it may have pages in swapcache, and needs to find those 2558 * suit tmpfs, since it may have pages in swapcache, and needs to find those
3049 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. 2559 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
3050 * 2560 *
3051 * Provide a stub for those callers to start using now, then later 2561 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
3052 * flesh it out to call shmem_getpage() with additional gfp mask, when 2562 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
3053 * shmem_file_splice_read() is added and shmem_readpage() is removed.
3054 */ 2563 */
3055struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 2564struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
3056 pgoff_t index, gfp_t gfp) 2565 pgoff_t index, gfp_t gfp)
3057{ 2566{
2567#ifdef CONFIG_SHMEM
2568 struct inode *inode = mapping->host;
2569 struct page *page;
2570 int error;
2571
2572 BUG_ON(mapping->a_ops != &shmem_aops);
2573 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
2574 if (error)
2575 page = ERR_PTR(error);
2576 else
2577 unlock_page(page);
2578 return page;
2579#else
2580 /*
2581 * The tiny !SHMEM case uses ramfs without swap
2582 */
3058 return read_cache_page_gfp(mapping, index, gfp); 2583 return read_cache_page_gfp(mapping, index, gfp);
2584#endif
3059} 2585}
3060EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); 2586EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
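A hypothetical driver-side caller of the helper above, following the i915 pattern the comment describes (driver_get_backing_page() is an illustrative name, not part of this patch): take the mapping's own gfp mask and mix in __GFP_NORETRY | __GFP_NOWARN, so a failed read returns an error the driver can recover from instead of invoking the OOM killer.

static struct page *driver_get_backing_page(struct address_space *mapping,
					    pgoff_t index)
{
	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;

	return shmem_read_mapping_page_gfp(mapping, index, gfp);
}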
diff --git a/mm/slab.c b/mm/slab.c
index d96e223de77..893c76df924 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -574,7 +574,9 @@ static struct arraycache_init initarray_generic =
574 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 574 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
575 575
576/* internal cache of cache description objs */ 576/* internal cache of cache description objs */
577static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
577static struct kmem_cache cache_cache = { 578static struct kmem_cache cache_cache = {
579 .nodelists = cache_cache_nodelists,
578 .batchcount = 1, 580 .batchcount = 1,
579 .limit = BOOT_CPUCACHE_ENTRIES, 581 .limit = BOOT_CPUCACHE_ENTRIES,
580 .shared = 1, 582 .shared = 1,
@@ -593,6 +595,7 @@ static enum {
593 PARTIAL_AC, 595 PARTIAL_AC,
594 PARTIAL_L3, 596 PARTIAL_L3,
595 EARLY, 597 EARLY,
598 LATE,
596 FULL 599 FULL
597} g_cpucache_up; 600} g_cpucache_up;
598 601
@@ -620,37 +623,67 @@ int slab_is_available(void)
620static struct lock_class_key on_slab_l3_key; 623static struct lock_class_key on_slab_l3_key;
621static struct lock_class_key on_slab_alc_key; 624static struct lock_class_key on_slab_alc_key;
622 625
626static struct lock_class_key debugobj_l3_key;
627static struct lock_class_key debugobj_alc_key;
628
629static void slab_set_lock_classes(struct kmem_cache *cachep,
630 struct lock_class_key *l3_key, struct lock_class_key *alc_key,
631 int q)
632{
633 struct array_cache **alc;
634 struct kmem_list3 *l3;
635 int r;
636
637 l3 = cachep->nodelists[q];
638 if (!l3)
639 return;
640
641 lockdep_set_class(&l3->list_lock, l3_key);
642 alc = l3->alien;
643 /*
644 * FIXME: This check for BAD_ALIEN_MAGIC
645 * should go away when common slab code is taught to
646 * work even without alien caches.
 647 * Currently, non-NUMA code returns BAD_ALIEN_MAGIC
 648 * for alloc_alien_cache().
649 */
650 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
651 return;
652 for_each_node(r) {
653 if (alc[r])
654 lockdep_set_class(&alc[r]->lock, alc_key);
655 }
656}
657
658static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
659{
660 slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
661}
662
663static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
664{
665 int node;
666
667 for_each_online_node(node)
668 slab_set_debugobj_lock_classes_node(cachep, node);
669}
670
623static void init_node_lock_keys(int q) 671static void init_node_lock_keys(int q)
624{ 672{
625 struct cache_sizes *s = malloc_sizes; 673 struct cache_sizes *s = malloc_sizes;
626 674
627 if (g_cpucache_up != FULL) 675 if (g_cpucache_up < LATE)
628 return; 676 return;
629 677
630 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { 678 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
631 struct array_cache **alc;
632 struct kmem_list3 *l3; 679 struct kmem_list3 *l3;
633 int r;
634 680
635 l3 = s->cs_cachep->nodelists[q]; 681 l3 = s->cs_cachep->nodelists[q];
636 if (!l3 || OFF_SLAB(s->cs_cachep)) 682 if (!l3 || OFF_SLAB(s->cs_cachep))
637 continue; 683 continue;
638 lockdep_set_class(&l3->list_lock, &on_slab_l3_key); 684
639 alc = l3->alien; 685 slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key,
640 /* 686 &on_slab_alc_key, q);
641 * FIXME: This check for BAD_ALIEN_MAGIC
642 * should go away when common slab code is taught to
643 * work even without alien caches.
644 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
645 * for alloc_alien_cache,
646 */
647 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
648 continue;
649 for_each_node(r) {
650 if (alc[r])
651 lockdep_set_class(&alc[r]->lock,
652 &on_slab_alc_key);
653 }
654 } 687 }
655} 688}
656 689
@@ -669,6 +702,14 @@ static void init_node_lock_keys(int q)
669static inline void init_lock_keys(void) 702static inline void init_lock_keys(void)
670{ 703{
671} 704}
705
706static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
707{
708}
709
710static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
711{
712}
672#endif 713#endif
673 714
674/* 715/*
@@ -1262,6 +1303,8 @@ static int __cpuinit cpuup_prepare(long cpu)
1262 spin_unlock_irq(&l3->list_lock); 1303 spin_unlock_irq(&l3->list_lock);
1263 kfree(shared); 1304 kfree(shared);
1264 free_alien_cache(alien); 1305 free_alien_cache(alien);
1306 if (cachep->flags & SLAB_DEBUG_OBJECTS)
1307 slab_set_debugobj_lock_classes_node(cachep, node);
1265 } 1308 }
1266 init_node_lock_keys(node); 1309 init_node_lock_keys(node);
1267 1310
@@ -1492,11 +1535,10 @@ void __init kmem_cache_init(void)
1492 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; 1535 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1493 1536
1494 /* 1537 /*
1495 * struct kmem_cache size depends on nr_node_ids, which 1538 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1496 * can be less than MAX_NUMNODES.
1497 */ 1539 */
1498 cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + 1540 cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1499 nr_node_ids * sizeof(struct kmem_list3 *); 1541 nr_node_ids * sizeof(struct kmem_list3 *);
1500#if DEBUG 1542#if DEBUG
1501 cache_cache.obj_size = cache_cache.buffer_size; 1543 cache_cache.obj_size = cache_cache.buffer_size;
1502#endif 1544#endif
@@ -1625,6 +1667,11 @@ void __init kmem_cache_init_late(void)
1625{ 1667{
1626 struct kmem_cache *cachep; 1668 struct kmem_cache *cachep;
1627 1669
1670 g_cpucache_up = LATE;
1671
1672 /* Annotate slab for lockdep -- annotate the malloc caches */
1673 init_lock_keys();
1674
1628 /* 6) resize the head arrays to their final sizes */ 1675 /* 6) resize the head arrays to their final sizes */
1629 mutex_lock(&cache_chain_mutex); 1676 mutex_lock(&cache_chain_mutex);
1630 list_for_each_entry(cachep, &cache_chain, next) 1677 list_for_each_entry(cachep, &cache_chain, next)
@@ -1635,9 +1682,6 @@ void __init kmem_cache_init_late(void)
1635 /* Done! */ 1682 /* Done! */
1636 g_cpucache_up = FULL; 1683 g_cpucache_up = FULL;
1637 1684
1638 /* Annotate slab for lockdep -- annotate the malloc caches */
1639 init_lock_keys();
1640
1641 /* 1685 /*
1642 * Register a cpu startup notifier callback that initializes 1686 * Register a cpu startup notifier callback that initializes
1643 * cpu_cache_get for all new cpus 1687 * cpu_cache_get for all new cpus
@@ -2308,6 +2352,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2308 if (!cachep) 2352 if (!cachep)
2309 goto oops; 2353 goto oops;
2310 2354
2355 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
2311#if DEBUG 2356#if DEBUG
2312 cachep->obj_size = size; 2357 cachep->obj_size = size;
2313 2358
@@ -2424,6 +2469,16 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2424 goto oops; 2469 goto oops;
2425 } 2470 }
2426 2471
2472 if (flags & SLAB_DEBUG_OBJECTS) {
2473 /*
2474 * Would deadlock through slab_destroy()->call_rcu()->
2475 * debug_object_activate()->kmem_cache_alloc().
2476 */
2477 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
2478
2479 slab_set_debugobj_lock_classes(cachep);
2480 }
2481
2427 /* cache setup completed, link it into the list */ 2482 /* cache setup completed, link it into the list */
2428 list_add(&cachep->next, &cache_chain); 2483 list_add(&cachep->next, &cache_chain);
2429oops: 2484oops:
@@ -3153,12 +3208,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3153 objp += obj_offset(cachep); 3208 objp += obj_offset(cachep);
3154 if (cachep->ctor && cachep->flags & SLAB_POISON) 3209 if (cachep->ctor && cachep->flags & SLAB_POISON)
3155 cachep->ctor(objp); 3210 cachep->ctor(objp);
3156#if ARCH_SLAB_MINALIGN 3211 if (ARCH_SLAB_MINALIGN &&
3157 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3212 ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
3158 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3213 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3159 objp, ARCH_SLAB_MINALIGN); 3214 objp, (int)ARCH_SLAB_MINALIGN);
3160 } 3215 }
3161#endif
3162 return objp; 3216 return objp;
3163} 3217}
3164#else 3218#else
@@ -3402,7 +3456,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3402 cache_alloc_debugcheck_before(cachep, flags); 3456 cache_alloc_debugcheck_before(cachep, flags);
3403 local_irq_save(save_flags); 3457 local_irq_save(save_flags);
3404 3458
3405 if (nodeid == -1) 3459 if (nodeid == NUMA_NO_NODE)
3406 nodeid = slab_node; 3460 nodeid = slab_node;
3407 3461
3408 if (unlikely(!cachep->nodelists[nodeid])) { 3462 if (unlikely(!cachep->nodelists[nodeid])) {
@@ -3933,7 +3987,7 @@ fail:
3933 3987
3934struct ccupdate_struct { 3988struct ccupdate_struct {
3935 struct kmem_cache *cachep; 3989 struct kmem_cache *cachep;
3936 struct array_cache *new[NR_CPUS]; 3990 struct array_cache *new[0];
3937}; 3991};
3938 3992
3939static void do_ccupdate_local(void *info) 3993static void do_ccupdate_local(void *info)
@@ -3955,7 +4009,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3955 struct ccupdate_struct *new; 4009 struct ccupdate_struct *new;
3956 int i; 4010 int i;
3957 4011
3958 new = kzalloc(sizeof(*new), gfp); 4012 new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
4013 gfp);
3959 if (!new) 4014 if (!new)
3960 return -ENOMEM; 4015 return -ENOMEM;
3961 4016
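The ccupdate_struct change above replaces a fixed struct array_cache *new[NR_CPUS] with a trailing array sized by nr_cpu_ids at allocation time, so the allocation only pays for the cpus that can actually exist. A minimal sketch of the same pattern (struct percpu_ptrs and alloc_percpu_ptrs are illustrative names, not from the patch):

struct percpu_ptrs {
	void *owner;
	void *ptrs[0];		/* sized when the struct is allocated */
};

static struct percpu_ptrs *alloc_percpu_ptrs(int nr, gfp_t gfp)
{
	/* one header plus nr pointer slots, zeroed */
	return kzalloc(sizeof(struct percpu_ptrs) + nr * sizeof(void *), gfp);
}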
diff --git a/mm/slob.c b/mm/slob.c
index 46e0aee33a2..bf391818716 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -70,7 +70,7 @@
70 70
71#include <trace/events/kmem.h> 71#include <trace/events/kmem.h>
72 72
73#include <asm/atomic.h> 73#include <linux/atomic.h>
74 74
75/* 75/*
76 * slob_block has a field 'units', which indicates size of block if +ve, 76 * slob_block has a field 'units', which indicates size of block if +ve,
@@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
482 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 482 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
483 void *ret; 483 void *ret;
484 484
485 gfp &= gfp_allowed_mask;
486
485 lockdep_trace_alloc(gfp); 487 lockdep_trace_alloc(gfp);
486 488
487 if (size < PAGE_SIZE - align) { 489 if (size < PAGE_SIZE - align) {
@@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
608{ 610{
609 void *b; 611 void *b;
610 612
613 flags &= gfp_allowed_mask;
614
615 lockdep_trace_alloc(flags);
616
611 if (c->size < PAGE_SIZE) { 617 if (c->size < PAGE_SIZE) {
612 b = slob_alloc(c->size, flags, c->align, node); 618 b = slob_alloc(c->size, flags, c->align, node);
613 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, 619 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
diff --git a/mm/slub.c b/mm/slub.c
index 35f351f2619..f73234db904 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2,10 +2,11 @@
2 * SLUB: A slab allocator that limits cache line use instead of queuing 2 * SLUB: A slab allocator that limits cache line use instead of queuing
3 * objects in per cpu and per node lists. 3 * objects in per cpu and per node lists.
4 * 4 *
 5 * The allocator synchronizes using per slab locks and only 5 * The allocator synchronizes using per slab locks or atomic operations
6 * uses a centralized lock to manage a pool of partial slabs. 6 * and only uses a centralized lock to manage a pool of partial slabs.
7 * 7 *
8 * (C) 2007 SGI, Christoph Lameter 8 * (C) 2007 SGI, Christoph Lameter
9 * (C) 2011 Linux Foundation, Christoph Lameter
9 */ 10 */
10 11
11#include <linux/mm.h> 12#include <linux/mm.h>
@@ -27,20 +28,33 @@
27#include <linux/memory.h> 28#include <linux/memory.h>
28#include <linux/math64.h> 29#include <linux/math64.h>
29#include <linux/fault-inject.h> 30#include <linux/fault-inject.h>
31#include <linux/stacktrace.h>
30 32
31#include <trace/events/kmem.h> 33#include <trace/events/kmem.h>
32 34
33/* 35/*
34 * Lock order: 36 * Lock order:
35 * 1. slab_lock(page) 37 * 1. slub_lock (Global Semaphore)
36 * 2. slab->list_lock 38 * 2. node->list_lock
39 * 3. slab_lock(page) (Only on some arches and for debugging)
37 * 40 *
38 * The slab_lock protects operations on the object of a particular 41 * slub_lock
39 * slab and its metadata in the page struct. If the slab lock 42 *
40 * has been taken then no allocations nor frees can be performed 43 * The role of the slub_lock is to protect the list of all the slabs
41 * on the objects in the slab nor can the slab be added or removed 44 * and to synchronize major metadata changes to slab cache structures.
42 * from the partial or full lists since this would mean modifying 45 *
43 * the page_struct of the slab. 46 * The slab_lock is only used for debugging and on arches that do not
47 * have the ability to do a cmpxchg_double. It only protects the second
48 * double word in the page struct. Meaning
49 * A. page->freelist -> List of objects free in a page
50 * B. page->counters -> Counters of objects
51 * C. page->frozen -> frozen state
52 *
53 * If a slab is frozen then it is exempt from list management. It is not
54 * on any list. The processor that froze the slab is the one who can
55 * perform list operations on the page. Other processors may put objects
56 * onto the freelist but the processor that froze the slab is the only
57 * one that can retrieve the objects from the page's freelist.
44 * 58 *
45 * The list_lock protects the partial and full list on each node and 59 * The list_lock protects the partial and full list on each node and
46 * the partial slab counter. If taken then no new slabs may be added or 60 * the partial slab counter. If taken then no new slabs may be added or
@@ -53,20 +67,6 @@
53 * slabs, operations can continue without any centralized lock. F.e. 67 * slabs, operations can continue without any centralized lock. F.e.
54 * allocating a long series of objects that fill up slabs does not require 68 * allocating a long series of objects that fill up slabs does not require
55 * the list lock. 69 * the list lock.
56 *
57 * The lock order is sometimes inverted when we are trying to get a slab
58 * off a list. We take the list_lock and then look for a page on the list
59 * to use. While we do that objects in the slabs may be freed. We can
60 * only operate on the slab if we have also taken the slab_lock. So we use
61 * a slab_trylock() on the slab. If trylock was successful then no frees
62 * can occur anymore and we can use the slab for allocations etc. If the
63 * slab_trylock() does not succeed then frees are in progress in the slab and
64 * we must stay away from it for a while since we may cause a bouncing
65 * cacheline if we try to acquire the lock. So go onto the next slab.
66 * If all pages are busy then we may allocate a new slab instead of reusing
67 * a partial slab. A new slab has no one operating on it and thus there is
68 * no danger of cacheline contention.
69 *
70 * Interrupts are disabled during allocation and deallocation in order to 70 * Interrupts are disabled during allocation and deallocation in order to
71 * make the slab allocator safe to use in the context of an irq. In addition 71 * make the slab allocator safe to use in the context of an irq. In addition
72 * interrupts are disabled to ensure that the processor does not change 72 * interrupts are disabled to ensure that the processor does not change
@@ -131,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
131/* Enable to test recovery from slab corruption on boot */ 131/* Enable to test recovery from slab corruption on boot */
132#undef SLUB_RESILIENCY_TEST 132#undef SLUB_RESILIENCY_TEST
133 133
134/* Enable to log cmpxchg failures */
135#undef SLUB_DEBUG_CMPXCHG
136
134/* 137/*
135 * Mininum number of partial slabs. These will be left on the partial 138 * Mininum number of partial slabs. These will be left on the partial
136 * lists even if they are empty. kmem_cache_shrink may reclaim them. 139 * lists even if they are empty. kmem_cache_shrink may reclaim them.
@@ -166,10 +169,11 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
166 169
167#define OO_SHIFT 16 170#define OO_SHIFT 16
168#define OO_MASK ((1 << OO_SHIFT) - 1) 171#define OO_MASK ((1 << OO_SHIFT) - 1)
169#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ 172#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
170 173
171/* Internal SLUB flags */ 174/* Internal SLUB flags */
172#define __OBJECT_POISON 0x80000000UL /* Poison object */ 175#define __OBJECT_POISON 0x80000000UL /* Poison object */
176#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
173 177
174static int kmem_size = sizeof(struct kmem_cache); 178static int kmem_size = sizeof(struct kmem_cache);
175 179
@@ -191,8 +195,12 @@ static LIST_HEAD(slab_caches);
191/* 195/*
192 * Tracking user of a slab. 196 * Tracking user of a slab.
193 */ 197 */
198#define TRACK_ADDRS_COUNT 16
194struct track { 199struct track {
195 unsigned long addr; /* Called from address */ 200 unsigned long addr; /* Called from address */
201#ifdef CONFIG_STACKTRACE
202 unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
203#endif
196 int cpu; /* Was running on cpu */ 204 int cpu; /* Was running on cpu */
197 int pid; /* Pid context */ 205 int pid; /* Pid context */
198 unsigned long when; /* When did the operation occur */ 206 unsigned long when; /* When did the operation occur */
@@ -338,11 +346,99 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
338 return x.x & OO_MASK; 346 return x.x & OO_MASK;
339} 347}
340 348
349/*
350 * Per slab locking using the pagelock
351 */
352static __always_inline void slab_lock(struct page *page)
353{
354 bit_spin_lock(PG_locked, &page->flags);
355}
356
357static __always_inline void slab_unlock(struct page *page)
358{
359 __bit_spin_unlock(PG_locked, &page->flags);
360}
361
362/* Interrupts must be disabled (for the fallback code to work right) */
363static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
364 void *freelist_old, unsigned long counters_old,
365 void *freelist_new, unsigned long counters_new,
366 const char *n)
367{
368 VM_BUG_ON(!irqs_disabled());
369#ifdef CONFIG_CMPXCHG_DOUBLE
370 if (s->flags & __CMPXCHG_DOUBLE) {
371 if (cmpxchg_double(&page->freelist,
372 freelist_old, counters_old,
373 freelist_new, counters_new))
374 return 1;
375 } else
376#endif
377 {
378 slab_lock(page);
379 if (page->freelist == freelist_old && page->counters == counters_old) {
380 page->freelist = freelist_new;
381 page->counters = counters_new;
382 slab_unlock(page);
383 return 1;
384 }
385 slab_unlock(page);
386 }
387
388 cpu_relax();
389 stat(s, CMPXCHG_DOUBLE_FAIL);
390
391#ifdef SLUB_DEBUG_CMPXCHG
392 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
393#endif
394
395 return 0;
396}
397
398static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
399 void *freelist_old, unsigned long counters_old,
400 void *freelist_new, unsigned long counters_new,
401 const char *n)
402{
403#ifdef CONFIG_CMPXCHG_DOUBLE
404 if (s->flags & __CMPXCHG_DOUBLE) {
405 if (cmpxchg_double(&page->freelist,
406 freelist_old, counters_old,
407 freelist_new, counters_new))
408 return 1;
409 } else
410#endif
411 {
412 unsigned long flags;
413
414 local_irq_save(flags);
415 slab_lock(page);
416 if (page->freelist == freelist_old && page->counters == counters_old) {
417 page->freelist = freelist_new;
418 page->counters = counters_new;
419 slab_unlock(page);
420 local_irq_restore(flags);
421 return 1;
422 }
423 slab_unlock(page);
424 local_irq_restore(flags);
425 }
426
427 cpu_relax();
428 stat(s, CMPXCHG_DOUBLE_FAIL);
429
430#ifdef SLUB_DEBUG_CMPXCHG
431 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
432#endif
433
434 return 0;
435}
436
341#ifdef CONFIG_SLUB_DEBUG 437#ifdef CONFIG_SLUB_DEBUG
342/* 438/*
343 * Determine a map of object in use on a page. 439 * Determine a map of object in use on a page.
344 * 440 *
345 * Slab lock or node listlock must be held to guarantee that the page does 441 * Node listlock must be held to guarantee that the page does
346 * not vanish from under us. 442 * not vanish from under us.
347 */ 443 */
348static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) 444static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
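The new __cmpxchg_double_slab()/cmpxchg_double_slab() helpers above try a double-word compare-and-exchange on page->freelist and page->counters together, and fall back to the page bit-lock plus a by-hand comparison of both words when __CMPXCHG_DOUBLE is not set for the cache. A userspace sketch of the same compare-both-words-or-fail semantics, using GCC's generic __atomic_compare_exchange on a two-word struct (whether this compiles to a true lock-free cmpxchg16b or a libatomic fallback is platform dependent; the struct and function names are invented):

#include <stdbool.h>
#include <stdio.h>

struct slab_words {
        void *freelist;
        unsigned long counters;
};

/* Atomically replace (freelist, counters) only if both still match. */
static bool cmpxchg_double_sketch(struct slab_words *s,
                                  void *old_free, unsigned long old_cnt,
                                  void *new_free, unsigned long new_cnt)
{
        struct slab_words expected = { old_free, old_cnt };
        struct slab_words desired  = { new_free, new_cnt };

        /* May need -latomic; strong CAS, sequentially consistent. */
        return __atomic_compare_exchange(s, &expected, &desired,
                                         false, __ATOMIC_SEQ_CST,
                                         __ATOMIC_SEQ_CST);
}

int main(void)
{
        int obj;
        struct slab_words s = { NULL, 3 };

        if (cmpxchg_double_sketch(&s, NULL, 3, &obj, 4))
                printf("swapped: counters=%lu\n", s.counters);
        if (!cmpxchg_double_sketch(&s, NULL, 3, NULL, 0))
                printf("stale copy rejected\n");
        return 0;
}

The kernel variants additionally bump the CMPXCHG_DOUBLE_FAIL statistic and retry after cpu_relax(); the sketch only shows the core update rule that both words must be unchanged for the swap to commit.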
@@ -420,6 +516,24 @@ static void set_track(struct kmem_cache *s, void *object,
420 struct track *p = get_track(s, object, alloc); 516 struct track *p = get_track(s, object, alloc);
421 517
422 if (addr) { 518 if (addr) {
519#ifdef CONFIG_STACKTRACE
520 struct stack_trace trace;
521 int i;
522
523 trace.nr_entries = 0;
524 trace.max_entries = TRACK_ADDRS_COUNT;
525 trace.entries = p->addrs;
526 trace.skip = 3;
527 save_stack_trace(&trace);
528
529 /* See rant in lockdep.c */
530 if (trace.nr_entries != 0 &&
531 trace.entries[trace.nr_entries - 1] == ULONG_MAX)
532 trace.nr_entries--;
533
534 for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
535 p->addrs[i] = 0;
536#endif
423 p->addr = addr; 537 p->addr = addr;
424 p->cpu = smp_processor_id(); 538 p->cpu = smp_processor_id();
425 p->pid = current->pid; 539 p->pid = current->pid;
@@ -444,6 +558,16 @@ static void print_track(const char *s, struct track *t)
444 558
445 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", 559 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
446 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); 560 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
561#ifdef CONFIG_STACKTRACE
562 {
563 int i;
564 for (i = 0; i < TRACK_ADDRS_COUNT; i++)
565 if (t->addrs[i])
566 printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
567 else
568 break;
569 }
570#endif
447} 571}
448 572
449static void print_tracking(struct kmem_cache *s, void *object) 573static void print_tracking(struct kmem_cache *s, void *object)
@@ -557,10 +681,10 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
557 memset(p + s->objsize, val, s->inuse - s->objsize); 681 memset(p + s->objsize, val, s->inuse - s->objsize);
558} 682}
559 683
560static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) 684static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes)
561{ 685{
562 while (bytes) { 686 while (bytes) {
563 if (*start != (u8)value) 687 if (*start != value)
564 return start; 688 return start;
565 start++; 689 start++;
566 bytes--; 690 bytes--;
@@ -568,6 +692,38 @@ static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
568 return NULL; 692 return NULL;
569} 693}
570 694
695static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
696{
697 u64 value64;
698 unsigned int words, prefix;
699
700 if (bytes <= 16)
701 return check_bytes8(start, value, bytes);
702
703 value64 = value | value << 8 | value << 16 | value << 24;
704 value64 = (value64 & 0xffffffff) | value64 << 32;
705 prefix = 8 - ((unsigned long)start) % 8;
706
707 if (prefix) {
708 u8 *r = check_bytes8(start, value, prefix);
709 if (r)
710 return r;
711 start += prefix;
712 bytes -= prefix;
713 }
714
715 words = bytes / 8;
716
717 while (words) {
718 if (*(u64 *)start != value64)
719 return check_bytes8(start, value, 8);
720 start += 8;
721 words--;
722 }
723
724 return check_bytes8(start, value, bytes % 8);
725}
726
571static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 727static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
572 void *from, void *to) 728 void *from, void *to)
573{ 729{
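The new check_bytes() above scans a poisoned region a u64 at a time once the start pointer is aligned, falling back to check_bytes8() for the unaligned prefix, for any mismatching word, and for the tail. A self-contained sketch of that scan (userspace C; the 64-bit pattern is built with a multiply rather than the shift chain in the patch, and the aligned-start case is handled slightly differently):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Byte-at-a-time scan; returns first mismatch or NULL. */
static uint8_t *scan8(uint8_t *start, uint8_t value, size_t bytes)
{
        for (; bytes; start++, bytes--)
                if (*start != value)
                        return start;
        return NULL;
}

static uint8_t *scan(uint8_t *start, uint8_t value, size_t bytes)
{
        uint64_t value64 = 0x0101010101010101ULL * value;
        size_t prefix = (8 - (uintptr_t)start % 8) % 8;

        if (bytes <= 16)
                return scan8(start, value, bytes);

        if (prefix) {                                   /* unaligned head */
                uint8_t *r = scan8(start, value, prefix);
                if (r)
                        return r;
                start += prefix;
                bytes -= prefix;
        }
        for (; bytes >= 8; start += 8, bytes -= 8)      /* aligned body */
                if (memcmp(start, &value64, 8))
                        return scan8(start, value, 8);
        return scan8(start, value, bytes);              /* tail */
}

int main(void)
{
        uint8_t buf[64];

        memset(buf, 0x6b, sizeof(buf));         /* POISON_FREE-style fill */
        buf[41] = 0;
        printf("mismatch at offset %td\n", scan(buf, 0x6b, sizeof(buf)) - buf);
        return 0;
}

Because every byte of value64 is identical, the word compare is endian-neutral, and the bytewise rescan of a failing word pins down the exact offending offset for the error report.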
@@ -773,10 +929,11 @@ static int check_slab(struct kmem_cache *s, struct page *page)
773static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 929static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
774{ 930{
775 int nr = 0; 931 int nr = 0;
776 void *fp = page->freelist; 932 void *fp;
777 void *object = NULL; 933 void *object = NULL;
778 unsigned long max_objects; 934 unsigned long max_objects;
779 935
936 fp = page->freelist;
780 while (fp && nr <= page->objects) { 937 while (fp && nr <= page->objects) {
781 if (fp == search) 938 if (fp == search)
782 return 1; 939 return 1;
@@ -881,26 +1038,27 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
881 1038
882/* 1039/*
883 * Tracking of fully allocated slabs for debugging purposes. 1040 * Tracking of fully allocated slabs for debugging purposes.
1041 *
1042 * list_lock must be held.
884 */ 1043 */
885static void add_full(struct kmem_cache_node *n, struct page *page) 1044static void add_full(struct kmem_cache *s,
1045 struct kmem_cache_node *n, struct page *page)
886{ 1046{
887 spin_lock(&n->list_lock); 1047 if (!(s->flags & SLAB_STORE_USER))
1048 return;
1049
888 list_add(&page->lru, &n->full); 1050 list_add(&page->lru, &n->full);
889 spin_unlock(&n->list_lock);
890} 1051}
891 1052
1053/*
1054 * list_lock must be held.
1055 */
892static void remove_full(struct kmem_cache *s, struct page *page) 1056static void remove_full(struct kmem_cache *s, struct page *page)
893{ 1057{
894 struct kmem_cache_node *n;
895
896 if (!(s->flags & SLAB_STORE_USER)) 1058 if (!(s->flags & SLAB_STORE_USER))
897 return; 1059 return;
898 1060
899 n = get_node(s, page_to_nid(page));
900
901 spin_lock(&n->list_lock);
902 list_del(&page->lru); 1061 list_del(&page->lru);
903 spin_unlock(&n->list_lock);
904} 1062}
905 1063
906/* Tracking of the number of slabs for debugging purposes */ 1064/* Tracking of the number of slabs for debugging purposes */
@@ -956,11 +1114,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa
956 if (!check_slab(s, page)) 1114 if (!check_slab(s, page))
957 goto bad; 1115 goto bad;
958 1116
959 if (!on_freelist(s, page, object)) {
960 object_err(s, page, object, "Object already allocated");
961 goto bad;
962 }
963
964 if (!check_valid_pointer(s, page, object)) { 1117 if (!check_valid_pointer(s, page, object)) {
965 object_err(s, page, object, "Freelist Pointer check fails"); 1118 object_err(s, page, object, "Freelist Pointer check fails");
966 goto bad; 1119 goto bad;
@@ -993,6 +1146,12 @@ bad:
993static noinline int free_debug_processing(struct kmem_cache *s, 1146static noinline int free_debug_processing(struct kmem_cache *s,
994 struct page *page, void *object, unsigned long addr) 1147 struct page *page, void *object, unsigned long addr)
995{ 1148{
1149 unsigned long flags;
1150 int rc = 0;
1151
1152 local_irq_save(flags);
1153 slab_lock(page);
1154
996 if (!check_slab(s, page)) 1155 if (!check_slab(s, page))
997 goto fail; 1156 goto fail;
998 1157
@@ -1007,7 +1166,7 @@ static noinline int free_debug_processing(struct kmem_cache *s,
1007 } 1166 }
1008 1167
1009 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1168 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1010 return 0; 1169 goto out;
1011 1170
1012 if (unlikely(s != page->slab)) { 1171 if (unlikely(s != page->slab)) {
1013 if (!PageSlab(page)) { 1172 if (!PageSlab(page)) {
@@ -1024,18 +1183,19 @@ static noinline int free_debug_processing(struct kmem_cache *s,
1024 goto fail; 1183 goto fail;
1025 } 1184 }
1026 1185
1027 /* Special debug activities for freeing objects */
1028 if (!PageSlubFrozen(page) && !page->freelist)
1029 remove_full(s, page);
1030 if (s->flags & SLAB_STORE_USER) 1186 if (s->flags & SLAB_STORE_USER)
1031 set_track(s, object, TRACK_FREE, addr); 1187 set_track(s, object, TRACK_FREE, addr);
1032 trace(s, page, object, 0); 1188 trace(s, page, object, 0);
1033 init_object(s, object, SLUB_RED_INACTIVE); 1189 init_object(s, object, SLUB_RED_INACTIVE);
1034 return 1; 1190 rc = 1;
1191out:
1192 slab_unlock(page);
1193 local_irq_restore(flags);
1194 return rc;
1035 1195
1036fail: 1196fail:
1037 slab_fix(s, "Object at 0x%p not freed", object); 1197 slab_fix(s, "Object at 0x%p not freed", object);
1038 return 0; 1198 goto out;
1039} 1199}
1040 1200
1041static int __init setup_slub_debug(char *str) 1201static int __init setup_slub_debug(char *str)
@@ -1135,7 +1295,9 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1135 { return 1; } 1295 { return 1; }
1136static inline int check_object(struct kmem_cache *s, struct page *page, 1296static inline int check_object(struct kmem_cache *s, struct page *page,
1137 void *object, u8 val) { return 1; } 1297 void *object, u8 val) { return 1; }
1138static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1298static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1299 struct page *page) {}
1300static inline void remove_full(struct kmem_cache *s, struct page *page) {}
1139static inline unsigned long kmem_cache_flags(unsigned long objsize, 1301static inline unsigned long kmem_cache_flags(unsigned long objsize,
1140 unsigned long flags, const char *name, 1302 unsigned long flags, const char *name,
1141 void (*ctor)(void *)) 1303 void (*ctor)(void *))
@@ -1187,6 +1349,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1187 struct kmem_cache_order_objects oo = s->oo; 1349 struct kmem_cache_order_objects oo = s->oo;
1188 gfp_t alloc_gfp; 1350 gfp_t alloc_gfp;
1189 1351
1352 flags &= gfp_allowed_mask;
1353
1354 if (flags & __GFP_WAIT)
1355 local_irq_enable();
1356
1190 flags |= s->allocflags; 1357 flags |= s->allocflags;
1191 1358
1192 /* 1359 /*
@@ -1203,12 +1370,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1203 * Try a lower order alloc if possible 1370 * Try a lower order alloc if possible
1204 */ 1371 */
1205 page = alloc_slab_page(flags, node, oo); 1372 page = alloc_slab_page(flags, node, oo);
1206 if (!page)
1207 return NULL;
1208 1373
1209 stat(s, ORDER_FALLBACK); 1374 if (page)
1375 stat(s, ORDER_FALLBACK);
1210 } 1376 }
1211 1377
1378 if (flags & __GFP_WAIT)
1379 local_irq_disable();
1380
1381 if (!page)
1382 return NULL;
1383
1212 if (kmemcheck_enabled 1384 if (kmemcheck_enabled
1213 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1385 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1214 int pages = 1 << oo_order(oo); 1386 int pages = 1 << oo_order(oo);
@@ -1276,6 +1448,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1276 1448
1277 page->freelist = start; 1449 page->freelist = start;
1278 page->inuse = 0; 1450 page->inuse = 0;
1451 page->frozen = 1;
1279out: 1452out:
1280 return page; 1453 return page;
1281} 1454}
@@ -1353,77 +1526,87 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
1353} 1526}
1354 1527
1355/* 1528/*
1356 * Per slab locking using the pagelock 1529 * Management of partially allocated slabs.
1357 */ 1530 *
1358static __always_inline void slab_lock(struct page *page) 1531 * list_lock must be held.
1359{
1360 bit_spin_lock(PG_locked, &page->flags);
1361}
1362
1363static __always_inline void slab_unlock(struct page *page)
1364{
1365 __bit_spin_unlock(PG_locked, &page->flags);
1366}
1367
1368static __always_inline int slab_trylock(struct page *page)
1369{
1370 int rc = 1;
1371
1372 rc = bit_spin_trylock(PG_locked, &page->flags);
1373 return rc;
1374}
1375
1376/*
1377 * Management of partially allocated slabs
1378 */ 1532 */
1379static void add_partial(struct kmem_cache_node *n, 1533static inline void add_partial(struct kmem_cache_node *n,
1380 struct page *page, int tail) 1534 struct page *page, int tail)
1381{ 1535{
1382 spin_lock(&n->list_lock);
1383 n->nr_partial++; 1536 n->nr_partial++;
1384 if (tail) 1537 if (tail)
1385 list_add_tail(&page->lru, &n->partial); 1538 list_add_tail(&page->lru, &n->partial);
1386 else 1539 else
1387 list_add(&page->lru, &n->partial); 1540 list_add(&page->lru, &n->partial);
1388 spin_unlock(&n->list_lock);
1389} 1541}
1390 1542
1391static inline void __remove_partial(struct kmem_cache_node *n, 1543/*
1544 * list_lock must be held.
1545 */
1546static inline void remove_partial(struct kmem_cache_node *n,
1392 struct page *page) 1547 struct page *page)
1393{ 1548{
1394 list_del(&page->lru); 1549 list_del(&page->lru);
1395 n->nr_partial--; 1550 n->nr_partial--;
1396} 1551}
1397 1552
1398static void remove_partial(struct kmem_cache *s, struct page *page)
1399{
1400 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1401
1402 spin_lock(&n->list_lock);
1403 __remove_partial(n, page);
1404 spin_unlock(&n->list_lock);
1405}
1406
1407/* 1553/*
1408 * Lock slab and remove from the partial list. 1554 * Lock slab, remove from the partial list and put the object into the
1555 * per cpu freelist.
1409 * 1556 *
1410 * Must hold list_lock. 1557 * Must hold list_lock.
1411 */ 1558 */
1412static inline int lock_and_freeze_slab(struct kmem_cache_node *n, 1559static inline int acquire_slab(struct kmem_cache *s,
1413 struct page *page) 1560 struct kmem_cache_node *n, struct page *page)
1414{ 1561{
1415 if (slab_trylock(page)) { 1562 void *freelist;
1416 __remove_partial(n, page); 1563 unsigned long counters;
1417 __SetPageSlubFrozen(page); 1564 struct page new;
1565
1566 /*
1567 * Zap the freelist and set the frozen bit.
1568 * The old freelist is the list of objects for the
1569 * per cpu allocation list.
1570 */
1571 do {
1572 freelist = page->freelist;
1573 counters = page->counters;
1574 new.counters = counters;
1575 new.inuse = page->objects;
1576
1577 VM_BUG_ON(new.frozen);
1578 new.frozen = 1;
1579
1580 } while (!__cmpxchg_double_slab(s, page,
1581 freelist, counters,
1582 NULL, new.counters,
1583 "lock and freeze"));
1584
1585 remove_partial(n, page);
1586
1587 if (freelist) {
1588 /* Populate the per cpu freelist */
1589 this_cpu_write(s->cpu_slab->freelist, freelist);
1590 this_cpu_write(s->cpu_slab->page, page);
1591 this_cpu_write(s->cpu_slab->node, page_to_nid(page));
1418 return 1; 1592 return 1;
1593 } else {
1594 /*
1595 * Slab page came from the wrong list. No object to allocate
1596 * from. Put it onto the correct list and continue partial
1597 * scan.
1598 */
1599 printk(KERN_ERR "SLUB: %s : Page without available objects on"
1600 " partial list\n", s->name);
1601 return 0;
1419 } 1602 }
1420 return 0;
1421} 1603}
1422 1604
1423/* 1605/*
1424 * Try to allocate a partial slab from a specific node. 1606 * Try to allocate a partial slab from a specific node.
1425 */ 1607 */
1426static struct page *get_partial_node(struct kmem_cache_node *n) 1608static struct page *get_partial_node(struct kmem_cache *s,
1609 struct kmem_cache_node *n)
1427{ 1610{
1428 struct page *page; 1611 struct page *page;
1429 1612
@@ -1438,7 +1621,7 @@ static struct page *get_partial_node(struct kmem_cache_node *n)
1438 1621
1439 spin_lock(&n->list_lock); 1622 spin_lock(&n->list_lock);
1440 list_for_each_entry(page, &n->partial, lru) 1623 list_for_each_entry(page, &n->partial, lru)
1441 if (lock_and_freeze_slab(n, page)) 1624 if (acquire_slab(s, n, page))
1442 goto out; 1625 goto out;
1443 page = NULL; 1626 page = NULL;
1444out: 1627out:
@@ -1489,7 +1672,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1489 1672
1490 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1673 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1491 n->nr_partial > s->min_partial) { 1674 n->nr_partial > s->min_partial) {
1492 page = get_partial_node(n); 1675 page = get_partial_node(s, n);
1493 if (page) { 1676 if (page) {
1494 put_mems_allowed(); 1677 put_mems_allowed();
1495 return page; 1678 return page;
@@ -1509,60 +1692,13 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1509 struct page *page; 1692 struct page *page;
1510 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; 1693 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1511 1694
1512 page = get_partial_node(get_node(s, searchnode)); 1695 page = get_partial_node(s, get_node(s, searchnode));
1513 if (page || node != NUMA_NO_NODE) 1696 if (page || node != NUMA_NO_NODE)
1514 return page; 1697 return page;
1515 1698
1516 return get_any_partial(s, flags); 1699 return get_any_partial(s, flags);
1517} 1700}
1518 1701
1519/*
1520 * Move a page back to the lists.
1521 *
1522 * Must be called with the slab lock held.
1523 *
1524 * On exit the slab lock will have been dropped.
1525 */
1526static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1527 __releases(bitlock)
1528{
1529 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1530
1531 __ClearPageSlubFrozen(page);
1532 if (page->inuse) {
1533
1534 if (page->freelist) {
1535 add_partial(n, page, tail);
1536 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1537 } else {
1538 stat(s, DEACTIVATE_FULL);
1539 if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER))
1540 add_full(n, page);
1541 }
1542 slab_unlock(page);
1543 } else {
1544 stat(s, DEACTIVATE_EMPTY);
1545 if (n->nr_partial < s->min_partial) {
1546 /*
1547 * Adding an empty slab to the partial slabs in order
1548 * to avoid page allocator overhead. This slab needs
1549 * to come after the other slabs with objects in
1550 * so that the others get filled first. That way the
1551 * size of the partial list stays small.
1552 *
1553 * kmem_cache_shrink can reclaim any empty slabs from
1554 * the partial list.
1555 */
1556 add_partial(n, page, 1);
1557 slab_unlock(page);
1558 } else {
1559 slab_unlock(page);
1560 stat(s, FREE_SLAB);
1561 discard_slab(s, page);
1562 }
1563 }
1564}
1565
1566#ifdef CONFIG_PREEMPT 1702#ifdef CONFIG_PREEMPT
1567/* 1703/*
1568 * Calculate the next globally unique transaction for disambiguation 1704 * Calculate the next globally unique transaction for disambiguation
@@ -1632,42 +1768,161 @@ void init_kmem_cache_cpus(struct kmem_cache *s)
1632/* 1768/*
1633 * Remove the cpu slab 1769 * Remove the cpu slab
1634 */ 1770 */
1771
1772/*
1773 * Remove the cpu slab
1774 */
1635static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1775static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1636 __releases(bitlock)
1637{ 1776{
1777 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1638 struct page *page = c->page; 1778 struct page *page = c->page;
1639 int tail = 1; 1779 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1640 1780 int lock = 0;
1641 if (page->freelist) 1781 enum slab_modes l = M_NONE, m = M_NONE;
1782 void *freelist;
1783 void *nextfree;
1784 int tail = 0;
1785 struct page new;
1786 struct page old;
1787
1788 if (page->freelist) {
1642 stat(s, DEACTIVATE_REMOTE_FREES); 1789 stat(s, DEACTIVATE_REMOTE_FREES);
1790 tail = 1;
1791 }
1792
1793 c->tid = next_tid(c->tid);
1794 c->page = NULL;
1795 freelist = c->freelist;
1796 c->freelist = NULL;
1797
1643 /* 1798 /*
1644 * Merge cpu freelist into slab freelist. Typically we get here 1799 * Stage one: Free all available per cpu objects back
1645 * because both freelists are empty. So this is unlikely 1800 * to the page freelist while it is still frozen. Leave the
1646 * to occur. 1801 * last one.
1802 *
1803 * There is no need to take the list->lock because the page
1804 * is still frozen.
1647 */ 1805 */
1648 while (unlikely(c->freelist)) { 1806 while (freelist && (nextfree = get_freepointer(s, freelist))) {
1649 void **object; 1807 void *prior;
1808 unsigned long counters;
1809
1810 do {
1811 prior = page->freelist;
1812 counters = page->counters;
1813 set_freepointer(s, freelist, prior);
1814 new.counters = counters;
1815 new.inuse--;
1816 VM_BUG_ON(!new.frozen);
1817
1818 } while (!__cmpxchg_double_slab(s, page,
1819 prior, counters,
1820 freelist, new.counters,
1821 "drain percpu freelist"));
1822
1823 freelist = nextfree;
1824 }
1650 1825
1651 tail = 0; /* Hot objects. Put the slab first */ 1826 /*
1827 * Stage two: Ensure that the page is unfrozen while the
1828 * list presence reflects the actual number of objects
1829 * during unfreeze.
1830 *
1831 * We setup the list membership and then perform a cmpxchg
1832 * with the count. If there is a mismatch then the page
1833 * is not unfrozen but the page is on the wrong list.
1834 *
1835 * Then we restart the process which may have to remove
1836 * the page from the list that we just put it on again
1837 * because the number of objects in the slab may have
1838 * changed.
1839 */
1840redo:
1652 1841
1653 /* Retrieve object from cpu_freelist */ 1842 old.freelist = page->freelist;
1654 object = c->freelist; 1843 old.counters = page->counters;
1655 c->freelist = get_freepointer(s, c->freelist); 1844 VM_BUG_ON(!old.frozen);
1656 1845
1657 /* And put onto the regular freelist */ 1846 /* Determine target state of the slab */
1658 set_freepointer(s, object, page->freelist); 1847 new.counters = old.counters;
1659 page->freelist = object; 1848 if (freelist) {
1660 page->inuse--; 1849 new.inuse--;
1850 set_freepointer(s, freelist, old.freelist);
1851 new.freelist = freelist;
1852 } else
1853 new.freelist = old.freelist;
1854
1855 new.frozen = 0;
1856
1857 if (!new.inuse && n->nr_partial > s->min_partial)
1858 m = M_FREE;
1859 else if (new.freelist) {
1860 m = M_PARTIAL;
1861 if (!lock) {
1862 lock = 1;
1863 /*
1864 * Taking the spinlock removes the possibility
1865 * that acquire_slab() will see a slab page that
1866 * is frozen
1867 */
1868 spin_lock(&n->list_lock);
1869 }
1870 } else {
1871 m = M_FULL;
1872 if (kmem_cache_debug(s) && !lock) {
1873 lock = 1;
1874 /*
1875 * This also ensures that the scanning of full
1876 * slabs from diagnostic functions will not see
1877 * any frozen slabs.
1878 */
1879 spin_lock(&n->list_lock);
1880 }
1881 }
1882
1883 if (l != m) {
1884
1885 if (l == M_PARTIAL)
1886
1887 remove_partial(n, page);
1888
1889 else if (l == M_FULL)
1890
1891 remove_full(s, page);
1892
1893 if (m == M_PARTIAL) {
1894
1895 add_partial(n, page, tail);
1896 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1897
1898 } else if (m == M_FULL) {
1899
1900 stat(s, DEACTIVATE_FULL);
1901 add_full(s, n, page);
1902
1903 }
1904 }
1905
1906 l = m;
1907 if (!__cmpxchg_double_slab(s, page,
1908 old.freelist, old.counters,
1909 new.freelist, new.counters,
1910 "unfreezing slab"))
1911 goto redo;
1912
1913 if (lock)
1914 spin_unlock(&n->list_lock);
1915
1916 if (m == M_FREE) {
1917 stat(s, DEACTIVATE_EMPTY);
1918 discard_slab(s, page);
1919 stat(s, FREE_SLAB);
1661 } 1920 }
1662 c->page = NULL;
1663 c->tid = next_tid(c->tid);
1664 unfreeze_slab(s, page, tail);
1665} 1921}
1666 1922
1667static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1923static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1668{ 1924{
1669 stat(s, CPUSLAB_FLUSH); 1925 stat(s, CPUSLAB_FLUSH);
1670 slab_lock(c->page);
1671 deactivate_slab(s, c); 1926 deactivate_slab(s, c);
1672} 1927}
1673 1928
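The rewritten deactivate_slab() above works in two stages: it first drains the per-cpu freelist back into the still-frozen page with cmpxchg loops, then chooses a target state for the page (discard it, put it on the partial list, or put it on the full list) and commits the unfreeze with one more cmpxchg, retrying the whole placement if the page changed underneath. The placement policy itself is small; a sketch of just that decision, with made-up field names standing in for page->inuse, page->freelist and the node counters:

#include <stdio.h>

enum slab_mode { M_FREE, M_PARTIAL, M_FULL };

struct slab_state {
        unsigned int inuse;     /* objects still allocated from the page */
        int has_free;           /* page->freelist != NULL after the drain */
};

/*
 * Mirror of the policy in deactivate_slab(): empty pages are discarded
 * once the node already holds more than min_partial slabs, pages with
 * free objects go to the partial list, fully used pages go to the full
 * list (which only matters for debug caches).
 */
static enum slab_mode pick_mode(struct slab_state st,
                                unsigned long nr_partial,
                                unsigned long min_partial)
{
        if (!st.inuse && nr_partial > min_partial)
                return M_FREE;
        if (st.has_free)
                return M_PARTIAL;
        return M_FULL;
}

int main(void)
{
        struct slab_state empty = { 0, 1 }, half = { 3, 1 }, full = { 8, 0 };

        printf("%d %d %d\n",
               pick_mode(empty, 10, 5),         /* M_FREE */
               pick_mode(half, 10, 5),          /* M_PARTIAL */
               pick_mode(full, 10, 5));         /* M_FULL */
        return 0;
}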
@@ -1796,6 +2051,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1796 void **object; 2051 void **object;
1797 struct page *page; 2052 struct page *page;
1798 unsigned long flags; 2053 unsigned long flags;
2054 struct page new;
2055 unsigned long counters;
1799 2056
1800 local_irq_save(flags); 2057 local_irq_save(flags);
1801#ifdef CONFIG_PREEMPT 2058#ifdef CONFIG_PREEMPT
@@ -1814,72 +2071,102 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1814 if (!page) 2071 if (!page)
1815 goto new_slab; 2072 goto new_slab;
1816 2073
1817 slab_lock(page); 2074 if (unlikely(!node_match(c, node))) {
1818 if (unlikely(!node_match(c, node))) 2075 stat(s, ALLOC_NODE_MISMATCH);
1819 goto another_slab; 2076 deactivate_slab(s, c);
2077 goto new_slab;
2078 }
2079
2080 /* must check again c->freelist in case of cpu migration or IRQ */
2081 object = c->freelist;
2082 if (object)
2083 goto load_freelist;
2084
2085 stat(s, ALLOC_SLOWPATH);
2086
2087 do {
2088 object = page->freelist;
2089 counters = page->counters;
2090 new.counters = counters;
2091 VM_BUG_ON(!new.frozen);
2092
2093 /*
2094 * If there is no object left then we use this loop to
2095 * deactivate the slab which is simple since no objects
2096 * are left in the slab and therefore we do not need to
2097 * put the page back onto the partial list.
2098 *
2099 * If there are objects left then we retrieve them
2100 * and use them to refill the per cpu queue.
2101 */
2102
2103 new.inuse = page->objects;
2104 new.frozen = object != NULL;
2105
2106 } while (!__cmpxchg_double_slab(s, page,
2107 object, counters,
2108 NULL, new.counters,
2109 "__slab_alloc"));
2110
2111 if (unlikely(!object)) {
2112 c->page = NULL;
2113 stat(s, DEACTIVATE_BYPASS);
2114 goto new_slab;
2115 }
1820 2116
1821 stat(s, ALLOC_REFILL); 2117 stat(s, ALLOC_REFILL);
1822 2118
1823load_freelist: 2119load_freelist:
1824 object = page->freelist; 2120 VM_BUG_ON(!page->frozen);
1825 if (unlikely(!object))
1826 goto another_slab;
1827 if (kmem_cache_debug(s))
1828 goto debug;
1829
1830 c->freelist = get_freepointer(s, object); 2121 c->freelist = get_freepointer(s, object);
1831 page->inuse = page->objects;
1832 page->freelist = NULL;
1833
1834 slab_unlock(page);
1835 c->tid = next_tid(c->tid); 2122 c->tid = next_tid(c->tid);
1836 local_irq_restore(flags); 2123 local_irq_restore(flags);
1837 stat(s, ALLOC_SLOWPATH);
1838 return object; 2124 return object;
1839 2125
1840another_slab:
1841 deactivate_slab(s, c);
1842
1843new_slab: 2126new_slab:
1844 page = get_partial(s, gfpflags, node); 2127 page = get_partial(s, gfpflags, node);
1845 if (page) { 2128 if (page) {
1846 stat(s, ALLOC_FROM_PARTIAL); 2129 stat(s, ALLOC_FROM_PARTIAL);
1847 c->node = page_to_nid(page); 2130 object = c->freelist;
1848 c->page = page; 2131
2132 if (kmem_cache_debug(s))
2133 goto debug;
1849 goto load_freelist; 2134 goto load_freelist;
1850 } 2135 }
1851 2136
1852 gfpflags &= gfp_allowed_mask;
1853 if (gfpflags & __GFP_WAIT)
1854 local_irq_enable();
1855
1856 page = new_slab(s, gfpflags, node); 2137 page = new_slab(s, gfpflags, node);
1857 2138
1858 if (gfpflags & __GFP_WAIT)
1859 local_irq_disable();
1860
1861 if (page) { 2139 if (page) {
1862 c = __this_cpu_ptr(s->cpu_slab); 2140 c = __this_cpu_ptr(s->cpu_slab);
1863 stat(s, ALLOC_SLAB);
1864 if (c->page) 2141 if (c->page)
1865 flush_slab(s, c); 2142 flush_slab(s, c);
1866 2143
1867 slab_lock(page); 2144 /*
1868 __SetPageSlubFrozen(page); 2145 * No other reference to the page yet so we can
2146 * muck around with it freely without cmpxchg
2147 */
2148 object = page->freelist;
2149 page->freelist = NULL;
2150 page->inuse = page->objects;
2151
2152 stat(s, ALLOC_SLAB);
1869 c->node = page_to_nid(page); 2153 c->node = page_to_nid(page);
1870 c->page = page; 2154 c->page = page;
2155
2156 if (kmem_cache_debug(s))
2157 goto debug;
1871 goto load_freelist; 2158 goto load_freelist;
1872 } 2159 }
1873 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2160 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1874 slab_out_of_memory(s, gfpflags, node); 2161 slab_out_of_memory(s, gfpflags, node);
1875 local_irq_restore(flags); 2162 local_irq_restore(flags);
1876 return NULL; 2163 return NULL;
2164
1877debug: 2165debug:
1878 if (!alloc_debug_processing(s, page, object, addr)) 2166 if (!object || !alloc_debug_processing(s, page, object, addr))
1879 goto another_slab; 2167 goto new_slab;
1880 2168
1881 page->inuse++; 2169 c->freelist = get_freepointer(s, object);
1882 page->freelist = get_freepointer(s, object);
1883 deactivate_slab(s, c); 2170 deactivate_slab(s, c);
1884 c->page = NULL; 2171 c->page = NULL;
1885 c->node = NUMA_NO_NODE; 2172 c->node = NUMA_NO_NODE;
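In the reworked slow path above, a single __cmpxchg_double_slab() takes the entire page->freelist for the local CPU (leaving NULL behind and recording whether the page stays frozen) instead of popping objects one by one under slab_lock(). Detached from slabs, that "grab the whole list in one atomic operation" step looks like an atomic exchange of a list head; a sketch of the idea on a plain singly linked list (a toy with one consumer, so the ABA issues the real code handles via the counters word do not arise here):

#include <stdio.h>
#include <stdlib.h>

struct node {
        struct node *next;
        int val;
};

static struct node *head;               /* shared freelist head */

static void push(struct node *n)        /* producer side */
{
        struct node *old;

        do {
                old = __atomic_load_n(&head, __ATOMIC_ACQUIRE);
                n->next = old;
        } while (!__atomic_compare_exchange_n(&head, &old, n, false,
                                              __ATOMIC_RELEASE,
                                              __ATOMIC_RELAXED));
}

static struct node *take_all(void)      /* consumer: one atomic op */
{
        return __atomic_exchange_n(&head, NULL, __ATOMIC_ACQ_REL);
}

int main(void)
{
        for (int i = 0; i < 4; i++) {
                struct node *n = malloc(sizeof(*n));

                n->val = i;
                push(n);
        }
        for (struct node *n = take_all(); n; ) {
                struct node *next = n->next;

                printf("%d ", n->val);
                free(n);
                n = next;
        }
        printf("\n");
        return 0;
}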
@@ -2031,52 +2318,89 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2031{ 2318{
2032 void *prior; 2319 void *prior;
2033 void **object = (void *)x; 2320 void **object = (void *)x;
2034 unsigned long flags; 2321 int was_frozen;
2322 int inuse;
2323 struct page new;
2324 unsigned long counters;
2325 struct kmem_cache_node *n = NULL;
2326 unsigned long uninitialized_var(flags);
2035 2327
2036 local_irq_save(flags);
2037 slab_lock(page);
2038 stat(s, FREE_SLOWPATH); 2328 stat(s, FREE_SLOWPATH);
2039 2329
2040 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) 2330 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
2041 goto out_unlock; 2331 return;
2042 2332
2043 prior = page->freelist; 2333 do {
2044 set_freepointer(s, object, prior); 2334 prior = page->freelist;
2045 page->freelist = object; 2335 counters = page->counters;
2046 page->inuse--; 2336 set_freepointer(s, object, prior);
2337 new.counters = counters;
2338 was_frozen = new.frozen;
2339 new.inuse--;
2340 if ((!new.inuse || !prior) && !was_frozen && !n) {
2341 n = get_node(s, page_to_nid(page));
2342 /*
2343 * Speculatively acquire the list_lock.
2344 * If the cmpxchg does not succeed then we may
2345 * drop the list_lock without any processing.
2346 *
2347 * Otherwise the list_lock will synchronize with
2348 * other processors updating the list of slabs.
2349 */
2350 spin_lock_irqsave(&n->list_lock, flags);
2351 }
2352 inuse = new.inuse;
2047 2353
2048 if (unlikely(PageSlubFrozen(page))) { 2354 } while (!cmpxchg_double_slab(s, page,
2049 stat(s, FREE_FROZEN); 2355 prior, counters,
2050 goto out_unlock; 2356 object, new.counters,
2051 } 2357 "__slab_free"));
2052 2358
2053 if (unlikely(!page->inuse)) 2359 if (likely(!n)) {
2054 goto slab_empty; 2360 /*
2361 * The list lock was not taken therefore no list
2362 * activity can be necessary.
2363 */
2364 if (was_frozen)
2365 stat(s, FREE_FROZEN);
2366 return;
2367 }
2055 2368
2056 /* 2369 /*
2057 * Objects left in the slab. If it was not on the partial list before 2370 * was_frozen may have been set after we acquired the list_lock in
2058 * then add it. 2371 * an earlier loop. So we need to check it here again.
2059 */ 2372 */
2060 if (unlikely(!prior)) { 2373 if (was_frozen)
2061 add_partial(get_node(s, page_to_nid(page)), page, 1); 2374 stat(s, FREE_FROZEN);
2062 stat(s, FREE_ADD_PARTIAL); 2375 else {
2063 } 2376 if (unlikely(!inuse && n->nr_partial > s->min_partial))
2377 goto slab_empty;
2064 2378
2065out_unlock: 2379 /*
2066 slab_unlock(page); 2380 * Objects left in the slab. If it was not on the partial list before
2067 local_irq_restore(flags); 2381 * then add it.
2382 */
2383 if (unlikely(!prior)) {
2384 remove_full(s, page);
2385 add_partial(n, page, 1);
2386 stat(s, FREE_ADD_PARTIAL);
2387 }
2388 }
2389 spin_unlock_irqrestore(&n->list_lock, flags);
2068 return; 2390 return;
2069 2391
2070slab_empty: 2392slab_empty:
2071 if (prior) { 2393 if (prior) {
2072 /* 2394 /*
2073 * Slab still on the partial list. 2395 * Slab on the partial list.
2074 */ 2396 */
2075 remove_partial(s, page); 2397 remove_partial(n, page);
2076 stat(s, FREE_REMOVE_PARTIAL); 2398 stat(s, FREE_REMOVE_PARTIAL);
2077 } 2399 } else
2078 slab_unlock(page); 2400 /* Slab must be on the full list */
2079 local_irq_restore(flags); 2401 remove_full(s, page);
2402
2403 spin_unlock_irqrestore(&n->list_lock, flags);
2080 stat(s, FREE_SLAB); 2404 stat(s, FREE_SLAB);
2081 discard_slab(s, page); 2405 discard_slab(s, page);
2082} 2406}
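__slab_free() above now takes n->list_lock only speculatively, and only when the cheap test on the recomputed counters says the free might need list work (the page becomes empty or gains its first free object) and the page is not frozen; if the cmpxchg then shows no list manipulation is required, the lock is released without touching any list. A sketch of that pattern, take an expensive lock up front only when a cheap predicate says the slow path may be needed and otherwise stay lock-free, with invented names (compile with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static long counter;                    /* updated lock-free */

/* Cheap predicate: might this update need the locked slow path? */
static bool may_need_list_work(long oldval, long delta)
{
        return oldval + delta == 0;     /* "slab becomes empty" analogue */
}

static void update(long delta)
{
        bool locked = false;
        long old = __atomic_load_n(&counter, __ATOMIC_RELAXED);

        do {
                if (may_need_list_work(old, delta) && !locked) {
                        pthread_mutex_lock(&list_lock);
                        locked = true;  /* speculative: may turn out unneeded */
                }
                /* on failure the CAS refreshes 'old' and we re-decide */
        } while (!__atomic_compare_exchange_n(&counter, &old, old + delta,
                                              false, __ATOMIC_ACQ_REL,
                                              __ATOMIC_RELAXED));

        if (locked) {
                /* ... list manipulation would happen here ... */
                pthread_mutex_unlock(&list_lock);
        }
}

int main(void)
{
        update(5);
        update(-5);                     /* takes the lock speculatively */
        printf("counter=%ld\n", counter);
        return 0;
}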
@@ -2350,7 +2674,6 @@ static void early_kmem_cache_node_alloc(int node)
2350{ 2674{
2351 struct page *page; 2675 struct page *page;
2352 struct kmem_cache_node *n; 2676 struct kmem_cache_node *n;
2353 unsigned long flags;
2354 2677
2355 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); 2678 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2356 2679
@@ -2368,6 +2691,7 @@ static void early_kmem_cache_node_alloc(int node)
2368 BUG_ON(!n); 2691 BUG_ON(!n);
2369 page->freelist = get_freepointer(kmem_cache_node, n); 2692 page->freelist = get_freepointer(kmem_cache_node, n);
2370 page->inuse++; 2693 page->inuse++;
2694 page->frozen = 0;
2371 kmem_cache_node->node[node] = n; 2695 kmem_cache_node->node[node] = n;
2372#ifdef CONFIG_SLUB_DEBUG 2696#ifdef CONFIG_SLUB_DEBUG
2373 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2697 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
@@ -2376,14 +2700,7 @@ static void early_kmem_cache_node_alloc(int node)
2376 init_kmem_cache_node(n, kmem_cache_node); 2700 init_kmem_cache_node(n, kmem_cache_node);
2377 inc_slabs_node(kmem_cache_node, node, page->objects); 2701 inc_slabs_node(kmem_cache_node, node, page->objects);
2378 2702
2379 /*
2380 * lockdep requires consistent irq usage for each lock
2381 * so even though there cannot be a race this early in
2382 * the boot sequence, we still disable irqs.
2383 */
2384 local_irq_save(flags);
2385 add_partial(n, page, 0); 2703 add_partial(n, page, 0);
2386 local_irq_restore(flags);
2387} 2704}
2388 2705
2389static void free_kmem_cache_nodes(struct kmem_cache *s) 2706static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -2589,6 +2906,12 @@ static int kmem_cache_open(struct kmem_cache *s,
2589 } 2906 }
2590 } 2907 }
2591 2908
2909#ifdef CONFIG_CMPXCHG_DOUBLE
2910 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
2911 /* Enable fast mode */
2912 s->flags |= __CMPXCHG_DOUBLE;
2913#endif
2914
2592 /* 2915 /*
2593 * The larger the object size is, the more pages we want on the partial 2916 * The larger the object size is, the more pages we want on the partial
2594 * list to avoid pounding the page allocator excessively. 2917 * list to avoid pounding the page allocator excessively.
@@ -2661,7 +2984,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2661 spin_lock_irqsave(&n->list_lock, flags); 2984 spin_lock_irqsave(&n->list_lock, flags);
2662 list_for_each_entry_safe(page, h, &n->partial, lru) { 2985 list_for_each_entry_safe(page, h, &n->partial, lru) {
2663 if (!page->inuse) { 2986 if (!page->inuse) {
2664 __remove_partial(n, page); 2987 remove_partial(n, page);
2665 discard_slab(s, page); 2988 discard_slab(s, page);
2666 } else { 2989 } else {
2667 list_slab_objects(s, page, 2990 list_slab_objects(s, page,
@@ -2928,6 +3251,42 @@ size_t ksize(const void *object)
2928} 3251}
2929EXPORT_SYMBOL(ksize); 3252EXPORT_SYMBOL(ksize);
2930 3253
3254#ifdef CONFIG_SLUB_DEBUG
3255bool verify_mem_not_deleted(const void *x)
3256{
3257 struct page *page;
3258 void *object = (void *)x;
3259 unsigned long flags;
3260 bool rv;
3261
3262 if (unlikely(ZERO_OR_NULL_PTR(x)))
3263 return false;
3264
3265 local_irq_save(flags);
3266
3267 page = virt_to_head_page(x);
3268 if (unlikely(!PageSlab(page))) {
3269 /* maybe it was from stack? */
3270 rv = true;
3271 goto out_unlock;
3272 }
3273
3274 slab_lock(page);
3275 if (on_freelist(page->slab, page, object)) {
3276 object_err(page->slab, page, object, "Object is on free-list");
3277 rv = false;
3278 } else {
3279 rv = true;
3280 }
3281 slab_unlock(page);
3282
3283out_unlock:
3284 local_irq_restore(flags);
3285 return rv;
3286}
3287EXPORT_SYMBOL(verify_mem_not_deleted);
3288#endif
3289
2931void kfree(const void *x) 3290void kfree(const void *x)
2932{ 3291{
2933 struct page *page; 3292 struct page *page;
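verify_mem_not_deleted(), added above, returns false when the pointer refers to an object currently sitting on a SLUB freelist, and is only built under CONFIG_SLUB_DEBUG. A hypothetical caller might use it as a sanity check before dereferencing a pointer whose lifetime is in doubt; in the sketch below struct my_record and read_record() are invented for the example, and this is a kernel-context fragment rather than standalone code:

struct my_record {              /* hypothetical user of the check */
        int value;
};

static int read_record(struct my_record *rec)
{
        if (!verify_mem_not_deleted(rec))       /* object is on a freelist */
                return -EINVAL;
        return rec->value;
}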
@@ -2993,14 +3352,8 @@ int kmem_cache_shrink(struct kmem_cache *s)
2993 * list_lock. page->inuse here is the upper limit. 3352 * list_lock. page->inuse here is the upper limit.
2994 */ 3353 */
2995 list_for_each_entry_safe(page, t, &n->partial, lru) { 3354 list_for_each_entry_safe(page, t, &n->partial, lru) {
2996 if (!page->inuse && slab_trylock(page)) { 3355 if (!page->inuse) {
2997 /* 3356 remove_partial(n, page);
2998 * Must hold slab lock here because slab_free
2999 * may have freed the last object and be
3000 * waiting to release the slab.
3001 */
3002 __remove_partial(n, page);
3003 slab_unlock(page);
3004 discard_slab(s, page); 3357 discard_slab(s, page);
3005 } else { 3358 } else {
3006 list_move(&page->lru, 3359 list_move(&page->lru,
@@ -3588,12 +3941,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
3588static void validate_slab_slab(struct kmem_cache *s, struct page *page, 3941static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3589 unsigned long *map) 3942 unsigned long *map)
3590{ 3943{
3591 if (slab_trylock(page)) { 3944 slab_lock(page);
3592 validate_slab(s, page, map); 3945 validate_slab(s, page, map);
3593 slab_unlock(page); 3946 slab_unlock(page);
3594 } else
3595 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
3596 s->name, page);
3597} 3947}
3598 3948
3599static int validate_slab_node(struct kmem_cache *s, 3949static int validate_slab_node(struct kmem_cache *s,
@@ -4058,7 +4408,7 @@ static int any_slab_objects(struct kmem_cache *s)
4058#endif 4408#endif
4059 4409
4060#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 4410#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
4061#define to_slab(n) container_of(n, struct kmem_cache, kobj); 4411#define to_slab(n) container_of(n, struct kmem_cache, kobj)
4062 4412
4063struct slab_attribute { 4413struct slab_attribute {
4064 struct attribute attr; 4414 struct attribute attr;
@@ -4241,8 +4591,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s,
4241 const char *buf, size_t length) 4591 const char *buf, size_t length)
4242{ 4592{
4243 s->flags &= ~SLAB_DEBUG_FREE; 4593 s->flags &= ~SLAB_DEBUG_FREE;
4244 if (buf[0] == '1') 4594 if (buf[0] == '1') {
4595 s->flags &= ~__CMPXCHG_DOUBLE;
4245 s->flags |= SLAB_DEBUG_FREE; 4596 s->flags |= SLAB_DEBUG_FREE;
4597 }
4246 return length; 4598 return length;
4247} 4599}
4248SLAB_ATTR(sanity_checks); 4600SLAB_ATTR(sanity_checks);
@@ -4256,8 +4608,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4256 size_t length) 4608 size_t length)
4257{ 4609{
4258 s->flags &= ~SLAB_TRACE; 4610 s->flags &= ~SLAB_TRACE;
4259 if (buf[0] == '1') 4611 if (buf[0] == '1') {
4612 s->flags &= ~__CMPXCHG_DOUBLE;
4260 s->flags |= SLAB_TRACE; 4613 s->flags |= SLAB_TRACE;
4614 }
4261 return length; 4615 return length;
4262} 4616}
4263SLAB_ATTR(trace); 4617SLAB_ATTR(trace);
@@ -4274,8 +4628,10 @@ static ssize_t red_zone_store(struct kmem_cache *s,
4274 return -EBUSY; 4628 return -EBUSY;
4275 4629
4276 s->flags &= ~SLAB_RED_ZONE; 4630 s->flags &= ~SLAB_RED_ZONE;
4277 if (buf[0] == '1') 4631 if (buf[0] == '1') {
4632 s->flags &= ~__CMPXCHG_DOUBLE;
4278 s->flags |= SLAB_RED_ZONE; 4633 s->flags |= SLAB_RED_ZONE;
4634 }
4279 calculate_sizes(s, -1); 4635 calculate_sizes(s, -1);
4280 return length; 4636 return length;
4281} 4637}
@@ -4293,8 +4649,10 @@ static ssize_t poison_store(struct kmem_cache *s,
4293 return -EBUSY; 4649 return -EBUSY;
4294 4650
4295 s->flags &= ~SLAB_POISON; 4651 s->flags &= ~SLAB_POISON;
4296 if (buf[0] == '1') 4652 if (buf[0] == '1') {
4653 s->flags &= ~__CMPXCHG_DOUBLE;
4297 s->flags |= SLAB_POISON; 4654 s->flags |= SLAB_POISON;
4655 }
4298 calculate_sizes(s, -1); 4656 calculate_sizes(s, -1);
4299 return length; 4657 return length;
4300} 4658}
@@ -4312,8 +4670,10 @@ static ssize_t store_user_store(struct kmem_cache *s,
4312 return -EBUSY; 4670 return -EBUSY;
4313 4671
4314 s->flags &= ~SLAB_STORE_USER; 4672 s->flags &= ~SLAB_STORE_USER;
4315 if (buf[0] == '1') 4673 if (buf[0] == '1') {
4674 s->flags &= ~__CMPXCHG_DOUBLE;
4316 s->flags |= SLAB_STORE_USER; 4675 s->flags |= SLAB_STORE_USER;
4676 }
4317 calculate_sizes(s, -1); 4677 calculate_sizes(s, -1);
4318 return length; 4678 return length;
4319} 4679}
@@ -4478,6 +4838,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
4478STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 4838STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
4479STAT_ATTR(ALLOC_SLAB, alloc_slab); 4839STAT_ATTR(ALLOC_SLAB, alloc_slab);
4480STAT_ATTR(ALLOC_REFILL, alloc_refill); 4840STAT_ATTR(ALLOC_REFILL, alloc_refill);
4841STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
4481STAT_ATTR(FREE_SLAB, free_slab); 4842STAT_ATTR(FREE_SLAB, free_slab);
4482STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 4843STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
4483STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 4844STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
@@ -4485,7 +4846,10 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
4485STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 4846STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
4486STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 4847STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
4487STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 4848STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
4849STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
4488STAT_ATTR(ORDER_FALLBACK, order_fallback); 4850STAT_ATTR(ORDER_FALLBACK, order_fallback);
4851STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
4852STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
4489#endif 4853#endif
4490 4854
4491static struct attribute *slab_attrs[] = { 4855static struct attribute *slab_attrs[] = {
@@ -4535,6 +4899,7 @@ static struct attribute *slab_attrs[] = {
4535 &alloc_from_partial_attr.attr, 4899 &alloc_from_partial_attr.attr,
4536 &alloc_slab_attr.attr, 4900 &alloc_slab_attr.attr,
4537 &alloc_refill_attr.attr, 4901 &alloc_refill_attr.attr,
4902 &alloc_node_mismatch_attr.attr,
4538 &free_slab_attr.attr, 4903 &free_slab_attr.attr,
4539 &cpuslab_flush_attr.attr, 4904 &cpuslab_flush_attr.attr,
4540 &deactivate_full_attr.attr, 4905 &deactivate_full_attr.attr,
@@ -4542,7 +4907,10 @@ static struct attribute *slab_attrs[] = {
4542 &deactivate_to_head_attr.attr, 4907 &deactivate_to_head_attr.attr,
4543 &deactivate_to_tail_attr.attr, 4908 &deactivate_to_tail_attr.attr,
4544 &deactivate_remote_frees_attr.attr, 4909 &deactivate_remote_frees_attr.attr,
4910 &deactivate_bypass_attr.attr,
4545 &order_fallback_attr.attr, 4911 &order_fallback_attr.attr,
4912 &cmpxchg_double_fail_attr.attr,
4913 &cmpxchg_double_cpu_fail_attr.attr,
4546#endif 4914#endif
4547#ifdef CONFIG_FAILSLAB 4915#ifdef CONFIG_FAILSLAB
4548 &failslab_attr.attr, 4916 &failslab_attr.attr,
diff --git a/mm/sparse.c b/mm/sparse.c
index aa64b12831a..858e1dff9b2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -40,7 +40,7 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
40static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; 40static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
41#endif 41#endif
42 42
43int page_to_nid(struct page *page) 43int page_to_nid(const struct page *page)
44{ 44{
45 return section_to_node_table[page_to_section(page)]; 45 return section_to_node_table[page_to_section(page)];
46} 46}
diff --git a/mm/swap.c b/mm/swap.c
index 3a442f18b0b..87627f181c3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page)
78{ 78{
79 if (unlikely(PageTail(page))) { 79 if (unlikely(PageTail(page))) {
80 /* __split_huge_page_refcount can run under us */ 80 /* __split_huge_page_refcount can run under us */
81 struct page *page_head = page->first_page; 81 struct page *page_head = compound_trans_head(page);
82 smp_rmb(); 82
83 /* 83 if (likely(page != page_head &&
84 * If PageTail is still set after smp_rmb() we can be sure 84 get_page_unless_zero(page_head))) {
85 * that the page->first_page we read wasn't a dangling pointer.
86 * See __split_huge_page_refcount() smp_wmb().
87 */
88 if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
89 unsigned long flags; 85 unsigned long flags;
90 /* 86 /*
91 * Verify that our page_head wasn't converted 87 * page_head wasn't a dangling pointer but it
92 * to a a regular page before we got a 88 * may not be a head page anymore by the time
93 * reference on it. 89 * we obtain the lock. That is ok as long as it
90 * can't be freed from under us.
94 */ 91 */
95 if (unlikely(!PageHead(page_head))) {
96 /* PageHead is cleared after PageTail */
97 smp_rmb();
98 VM_BUG_ON(PageTail(page));
99 goto out_put_head;
100 }
101 /*
102 * Only run compound_lock on a valid PageHead,
103 * after having it pinned with
104 * get_page_unless_zero() above.
105 */
106 smp_mb();
107 /* page_head wasn't a dangling pointer */
108 flags = compound_lock_irqsave(page_head); 92 flags = compound_lock_irqsave(page_head);
109 if (unlikely(!PageTail(page))) { 93 if (unlikely(!PageTail(page))) {
110 /* __split_huge_page_refcount run before us */ 94 /* __split_huge_page_refcount run before us */
111 compound_unlock_irqrestore(page_head, flags); 95 compound_unlock_irqrestore(page_head, flags);
112 VM_BUG_ON(PageHead(page_head)); 96 VM_BUG_ON(PageHead(page_head));
113 out_put_head:
114 if (put_page_testzero(page_head)) 97 if (put_page_testzero(page_head))
115 __put_single_page(page_head); 98 __put_single_page(page_head);
116 out_put_single: 99 out_put_single:
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page)
121 VM_BUG_ON(page_head != page->first_page); 104 VM_BUG_ON(page_head != page->first_page);
122 /* 105 /*
123 * We can release the refcount taken by 106 * We can release the refcount taken by
124 * get_page_unless_zero now that 107 * get_page_unless_zero() now that
125 * split_huge_page_refcount is blocked on the 108 * __split_huge_page_refcount() is blocked on
126 * compound_lock. 109 * the compound_lock.
127 */ 110 */
128 if (put_page_testzero(page_head)) 111 if (put_page_testzero(page_head))
129 VM_BUG_ON(1); 112 VM_BUG_ON(1);
130 /* __split_huge_page_refcount will wait now */ 113 /* __split_huge_page_refcount will wait now */
131 VM_BUG_ON(atomic_read(&page->_count) <= 0); 114 VM_BUG_ON(page_mapcount(page) <= 0);
132 atomic_dec(&page->_count); 115 atomic_dec(&page->_mapcount);
133 VM_BUG_ON(atomic_read(&page_head->_count) <= 0); 116 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
117 VM_BUG_ON(atomic_read(&page->_count) != 0);
134 compound_unlock_irqrestore(page_head, flags); 118 compound_unlock_irqrestore(page_head, flags);
135 if (put_page_testzero(page_head)) { 119 if (put_page_testzero(page_head)) {
136 if (PageHead(page_head)) 120 if (PageHead(page_head))
@@ -160,6 +144,45 @@ void put_page(struct page *page)
160} 144}
161EXPORT_SYMBOL(put_page); 145EXPORT_SYMBOL(put_page);
162 146
147/*
148 * This function is exported but must not be called by anything other
149 * than get_page(). It implements the slow path of get_page().
150 */
151bool __get_page_tail(struct page *page)
152{
153 /*
154 * This takes care of get_page() if run on a tail page
155 * returned by one of the get_user_pages/follow_page variants.
156 * get_user_pages/follow_page itself doesn't need the compound
157 * lock because it runs __get_page_tail_foll() under the
158 * proper PT lock that already serializes against
159 * split_huge_page().
160 */
161 unsigned long flags;
162 bool got = false;
163 struct page *page_head = compound_trans_head(page);
164
165 if (likely(page != page_head && get_page_unless_zero(page_head))) {
166 /*
167 * page_head wasn't a dangling pointer but it
168 * may not be a head page anymore by the time
169 * we obtain the lock. That is ok as long as it
170 * can't be freed from under us.
171 */
172 flags = compound_lock_irqsave(page_head);
173 /* here __split_huge_page_refcount won't run anymore */
174 if (likely(PageTail(page))) {
175 __get_page_tail_foll(page, false);
176 got = true;
177 }
178 compound_unlock_irqrestore(page_head, flags);
179 if (unlikely(!got))
180 put_page(page_head);
181 }
182 return got;
183}
184EXPORT_SYMBOL(__get_page_tail);
185
163/** 186/**
164 * put_pages_list() - release a list of pages 187 * put_pages_list() - release a list of pages
165 * @pages: list of pages threaded on page->lru 188 * @pages: list of pages threaded on page->lru
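Both the simplified put_compound_page() and the new __get_page_tail() above follow the same shape: pin the presumed head with get_page_unless_zero(), take compound_lock_irqsave(), then re-check PageTail() because a concurrent __split_huge_page_refcount() may have changed the page's role in the meantime, and back out if it did. Reduced to its bones that is a "pin, lock, revalidate" pattern; a userspace sketch with a mutex standing in for the compound lock and an invented flag standing in for PageTail():

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
        pthread_mutex_t lock;   /* stands in for the compound lock */
        int refcount;
        bool still_tail;        /* condition a concurrent splitter may clear */
};

static bool pin_unless_zero(struct obj *o)
{
        int r = __atomic_load_n(&o->refcount, __ATOMIC_RELAXED);

        while (r != 0)
                if (__atomic_compare_exchange_n(&o->refcount, &r, r + 1,
                                                false, __ATOMIC_ACQ_REL,
                                                __ATOMIC_RELAXED))
                        return true;
        return false;
}

static void unpin(struct obj *o)
{
        __atomic_sub_fetch(&o->refcount, 1, __ATOMIC_ACQ_REL);
}

/* Pin, lock, then revalidate the condition; undo the pin on failure. */
static bool get_if_still_tail(struct obj *o)
{
        bool got = false;

        if (!pin_unless_zero(o))
                return false;
        pthread_mutex_lock(&o->lock);
        if (o->still_tail)
                got = true;             /* safe: the splitter is excluded now */
        pthread_mutex_unlock(&o->lock);
        if (!got)
                unpin(o);
        return got;
}

int main(void)
{
        static struct obj o = { PTHREAD_MUTEX_INITIALIZER, 1, true };

        printf("%d\n", get_if_still_tail(&o));  /* prints 1 */
        return 0;
}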
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ff8dc1a18cb..17bc224bce6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1681,19 +1681,14 @@ out:
1681} 1681}
1682 1682
1683#ifdef CONFIG_PROC_FS 1683#ifdef CONFIG_PROC_FS
1684struct proc_swaps {
1685 struct seq_file seq;
1686 int event;
1687};
1688
1689static unsigned swaps_poll(struct file *file, poll_table *wait) 1684static unsigned swaps_poll(struct file *file, poll_table *wait)
1690{ 1685{
1691 struct proc_swaps *s = file->private_data; 1686 struct seq_file *seq = file->private_data;
1692 1687
1693 poll_wait(file, &proc_poll_wait, wait); 1688 poll_wait(file, &proc_poll_wait, wait);
1694 1689
1695 if (s->event != atomic_read(&proc_poll_event)) { 1690 if (seq->poll_event != atomic_read(&proc_poll_event)) {
1696 s->event = atomic_read(&proc_poll_event); 1691 seq->poll_event = atomic_read(&proc_poll_event);
1697 return POLLIN | POLLRDNORM | POLLERR | POLLPRI; 1692 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1698 } 1693 }
1699 1694
@@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = {
1783 1778
1784static int swaps_open(struct inode *inode, struct file *file) 1779static int swaps_open(struct inode *inode, struct file *file)
1785{ 1780{
1786 struct proc_swaps *s; 1781 struct seq_file *seq;
1787 int ret; 1782 int ret;
1788 1783
1789 s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
1790 if (!s)
1791 return -ENOMEM;
1792
1793 file->private_data = s;
1794
1795 ret = seq_open(file, &swaps_op); 1784 ret = seq_open(file, &swaps_op);
1796 if (ret) { 1785 if (ret)
1797 kfree(s);
1798 return ret; 1786 return ret;
1799 }
1800 1787
1801 s->seq.private = s; 1788 seq = file->private_data;
1802 s->event = atomic_read(&proc_poll_event); 1789 seq->poll_event = atomic_read(&proc_poll_event);
1803 return ret; 1790 return 0;
1804} 1791}
1805 1792
1806static const struct file_operations proc_swaps_operations = { 1793static const struct file_operations proc_swaps_operations = {
@@ -1937,20 +1924,24 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1937 1924
1938 /* 1925 /*
1939 * Find out how many pages are allowed for a single swap 1926 * Find out how many pages are allowed for a single swap
1940 * device. There are two limiting factors: 1) the number of 1927 * device. There are three limiting factors: 1) the number
1941 * bits for the swap offset in the swp_entry_t type and 1928 * of bits for the swap offset in the swp_entry_t type, and
1942 * bits for the swap offset in the swp_entry_t type and 1928 * of bits for the swap offset in the swp_entry_t type, and
1943 * the different architectures. In order to find the 1930 * the different architectures, and 3) the number of free bits
1944 * largest possible bit mask a swap entry with swap type 0 1931 * in an exceptional radix_tree entry. In order to find the
1932 * largest possible bit mask, a swap entry with swap type 0
1945 * and swap offset ~0UL is created, encoded to a swap pte, 1933 * and swap offset ~0UL is created, encoded to a swap pte,
1946 * decoded to a swp_entry_t again and finally the swap 1934 * decoded to a swp_entry_t again, and finally the swap
1947 * offset is extracted. This will mask all the bits from 1935 * offset is extracted. This will mask all the bits from
1948 * the initial ~0UL mask that can't be encoded in either 1936 * the initial ~0UL mask that can't be encoded in either
1949 * the swp_entry_t or the architecture definition of a 1937 * the swp_entry_t or the architecture definition of a
1950 * swap pte. 1938 * swap pte. Then the same is done for a radix_tree entry.
1951 */ 1939 */
1952 maxpages = swp_offset(pte_to_swp_entry( 1940 maxpages = swp_offset(pte_to_swp_entry(
1953 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; 1941 swp_entry_to_pte(swp_entry(0, ~0UL))));
1942 maxpages = swp_offset(radix_to_swp_entry(
1943 swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
1944
1954 if (maxpages > swap_header->info.last_page) { 1945 if (maxpages > swap_header->info.last_page) {
1955 maxpages = swap_header->info.last_page + 1; 1946 maxpages = swap_header->info.last_page + 1;
1956 /* p->max is an unsigned int: don't overflow it */ 1947 /* p->max is an unsigned int: don't overflow it */
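The reworked calculation above finds the largest usable swap offset by round-tripping an all-ones offset through each representation in turn, first the architecture's swap pte and then the radix-tree exceptional entry, and keeping whatever bits survive. A small standalone C sketch of that idea follows; the 37- and 30-bit widths and the helper names are made up purely for illustration.

/* Toy model of the maxpages calculation: encode an all-ones offset
 * through every representation, decode it again, and the survivor is
 * the largest offset all of them can hold.  The 37/30-bit widths and
 * helper names are illustrative only.
 */
#include <stdio.h>

#define PTE_OFFSET_BITS   37UL   /* pretend arch swap pte limit */
#define RADIX_OFFSET_BITS 30UL   /* pretend radix_tree entry limit */

static unsigned long pte_round_trip(unsigned long offset)
{
	return offset & ((1UL << PTE_OFFSET_BITS) - 1);
}

static unsigned long radix_round_trip(unsigned long offset)
{
	return offset & ((1UL << RADIX_OFFSET_BITS) - 1);
}

int main(void)
{
	unsigned long maxpages;

	/* like swp_offset(pte_to_swp_entry(swp_entry_to_pte(...))) */
	maxpages = pte_round_trip(~0UL);
	/* then the same through the radix_tree representation, +1 */
	maxpages = radix_round_trip(maxpages) + 1;

	printf("maxpages = %lu\n", maxpages);
	return 0;
}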
diff --git a/mm/thrash.c b/mm/thrash.c
index fabf2d0f516..e53f7d02c17 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -6,7 +6,7 @@
6 * Released under the GPL, see the file COPYING for details. 6 * Released under the GPL, see the file COPYING for details.
7 * 7 *
8 * Simple token based thrashing protection, using the algorithm 8 * Simple token based thrashing protection, using the algorithm
9 * described in: http://www.cs.wm.edu/~sjiang/token.pdf 9 * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
10 * 10 *
11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> 11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
12 * Improved algorithm to pass token: 12 * Improved algorithm to pass token:
@@ -30,8 +30,6 @@
30static DEFINE_SPINLOCK(swap_token_lock); 30static DEFINE_SPINLOCK(swap_token_lock);
31struct mm_struct *swap_token_mm; 31struct mm_struct *swap_token_mm;
32struct mem_cgroup *swap_token_memcg; 32struct mem_cgroup *swap_token_memcg;
33static unsigned int global_faults;
34static unsigned int last_aging;
35 33
36#ifdef CONFIG_CGROUP_MEM_RES_CTLR 34#ifdef CONFIG_CGROUP_MEM_RES_CTLR
37static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) 35static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
@@ -55,6 +53,8 @@ void grab_swap_token(struct mm_struct *mm)
55{ 53{
56 int current_interval; 54 int current_interval;
57 unsigned int old_prio = mm->token_priority; 55 unsigned int old_prio = mm->token_priority;
56 static unsigned int global_faults;
57 static unsigned int last_aging;
58 58
59 global_faults++; 59 global_faults++;
60 60
@@ -67,6 +67,17 @@ void grab_swap_token(struct mm_struct *mm)
67 if (!swap_token_mm) 67 if (!swap_token_mm)
68 goto replace_token; 68 goto replace_token;
69 69
70 /*
71 * Usually priority aging is unnecessary, because long fault
72 * intervals already make the priority drop quickly. There is one
73 * exception: a sleeping token owner never produces long-interval
74 * faults, so it needs an explicit priority aging mechanism instead.
75 * The requirements for priority aging are:
76 * 1) The aging interval must be reasonably long. Too short an
77 *    interval loses the swap token quickly and hurts performance.
78 * 2) The swap token owner task must still get priority aging even
79 *    while it is asleep.
80 */
70 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { 81 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
71 swap_token_mm->token_priority /= 2; 82 swap_token_mm->token_priority /= 2;
72 last_aging = global_faults; 83 last_aging = global_faults;
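With the change above, global_faults and last_aging become function-local statics of grab_swap_token(), and the new comment spells out the aging rule: once more than TOKEN_AGING_INTERVAL faults have occurred since the last aging, the current owner's priority is halved, even if that owner is asleep and never produces long fault intervals of its own. The standalone C sketch below replays just that counter logic; the interval value and function names are invented.

/* Toy replay of the priority-aging rule: function-local static
 * counters, and the owner's priority is halved once per aging
 * interval regardless of whether the owner itself is faulting.
 * All names and numbers are illustrative.
 */
#include <stdio.h>

#define TOKEN_AGING_INTERVAL 100	/* made-up value */

static unsigned int owner_priority = 16;

static void grab(void)
{
	static unsigned int global_faults;
	static unsigned int last_aging;

	global_faults++;

	if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
		owner_priority /= 2;		/* age the sleeping owner */
		last_aging = global_faults;
	}
}

int main(void)
{
	for (int i = 0; i < 500; i++)
		grab();
	/* 500 faults => four aging steps: 16 -> 8 -> 4 -> 2 -> 1 */
	printf("owner priority after 500 faults: %u\n", owner_priority);
	return 0;
}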
diff --git a/mm/truncate.c b/mm/truncate.c
index e13f22efaad..b40ac6d4e86 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -199,9 +199,6 @@ int invalidate_inode_page(struct page *page)
199 * The first pass will remove most pages, so the search cost of the second pass 199 * The first pass will remove most pages, so the search cost of the second pass
200 * is low. 200 * is low.
201 * 201 *
202 * When looking at page->index outside the page lock we need to be careful to
203 * copy it into a local to avoid races (it could change at any time).
204 *
205 * We pass down the cache-hot hint to the page freeing code. Even if the 202 * We pass down the cache-hot hint to the page freeing code. Even if the
206 * mapping is large, it is probably the case that the final pages are the most 203 * mapping is large, it is probably the case that the final pages are the most
207 * recently touched, and freeing happens in ascending file offset order. 204 * recently touched, and freeing happens in ascending file offset order.
@@ -210,10 +207,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
210 loff_t lstart, loff_t lend) 207 loff_t lstart, loff_t lend)
211{ 208{
212 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 209 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
213 pgoff_t end;
214 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 210 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
215 struct pagevec pvec; 211 struct pagevec pvec;
216 pgoff_t next; 212 pgoff_t index;
213 pgoff_t end;
217 int i; 214 int i;
218 215
219 cleancache_flush_inode(mapping); 216 cleancache_flush_inode(mapping);
@@ -224,24 +221,21 @@ void truncate_inode_pages_range(struct address_space *mapping,
224 end = (lend >> PAGE_CACHE_SHIFT); 221 end = (lend >> PAGE_CACHE_SHIFT);
225 222
226 pagevec_init(&pvec, 0); 223 pagevec_init(&pvec, 0);
227 next = start; 224 index = start;
228 while (next <= end && 225 while (index <= end && pagevec_lookup(&pvec, mapping, index,
229 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 226 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
230 mem_cgroup_uncharge_start(); 227 mem_cgroup_uncharge_start();
231 for (i = 0; i < pagevec_count(&pvec); i++) { 228 for (i = 0; i < pagevec_count(&pvec); i++) {
232 struct page *page = pvec.pages[i]; 229 struct page *page = pvec.pages[i];
233 pgoff_t page_index = page->index;
234 230
235 if (page_index > end) { 231 /* We rely upon deletion not changing page->index */
236 next = page_index; 232 index = page->index;
233 if (index > end)
237 break; 234 break;
238 }
239 235
240 if (page_index > next)
241 next = page_index;
242 next++;
243 if (!trylock_page(page)) 236 if (!trylock_page(page))
244 continue; 237 continue;
238 WARN_ON(page->index != index);
245 if (PageWriteback(page)) { 239 if (PageWriteback(page)) {
246 unlock_page(page); 240 unlock_page(page);
247 continue; 241 continue;
@@ -252,6 +246,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
252 pagevec_release(&pvec); 246 pagevec_release(&pvec);
253 mem_cgroup_uncharge_end(); 247 mem_cgroup_uncharge_end();
254 cond_resched(); 248 cond_resched();
249 index++;
255 } 250 }
256 251
257 if (partial) { 252 if (partial) {
@@ -264,16 +259,17 @@ void truncate_inode_pages_range(struct address_space *mapping,
264 } 259 }
265 } 260 }
266 261
267 next = start; 262 index = start;
268 for ( ; ; ) { 263 for ( ; ; ) {
269 cond_resched(); 264 cond_resched();
270 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 265 if (!pagevec_lookup(&pvec, mapping, index,
271 if (next == start) 266 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
267 if (index == start)
272 break; 268 break;
273 next = start; 269 index = start;
274 continue; 270 continue;
275 } 271 }
276 if (pvec.pages[0]->index > end) { 272 if (index == start && pvec.pages[0]->index > end) {
277 pagevec_release(&pvec); 273 pagevec_release(&pvec);
278 break; 274 break;
279 } 275 }
@@ -281,18 +277,20 @@ void truncate_inode_pages_range(struct address_space *mapping,
281 for (i = 0; i < pagevec_count(&pvec); i++) { 277 for (i = 0; i < pagevec_count(&pvec); i++) {
282 struct page *page = pvec.pages[i]; 278 struct page *page = pvec.pages[i];
283 279
284 if (page->index > end) 280 /* We rely upon deletion not changing page->index */
281 index = page->index;
282 if (index > end)
285 break; 283 break;
284
286 lock_page(page); 285 lock_page(page);
286 WARN_ON(page->index != index);
287 wait_on_page_writeback(page); 287 wait_on_page_writeback(page);
288 truncate_inode_page(mapping, page); 288 truncate_inode_page(mapping, page);
289 if (page->index > next)
290 next = page->index;
291 next++;
292 unlock_page(page); 289 unlock_page(page);
293 } 290 }
294 pagevec_release(&pvec); 291 pagevec_release(&pvec);
295 mem_cgroup_uncharge_end(); 292 mem_cgroup_uncharge_end();
293 index++;
296 } 294 }
297 cleancache_flush_inode(mapping); 295 cleancache_flush_inode(mapping);
298} 296}
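The rewritten loops above share one shape: trust that deletion never changes page->index, re-read index from every page in the pagevec, stop as soon as it passes end, clamp each lookup to min(end - index, PAGEVEC_SIZE - 1) + 1 entries, and advance index by one only after the whole pagevec has been processed. The standalone C sketch below simulates that iteration over a sparse, sorted set of indices; lookup_batch(), the batch size and the index values are invented for illustration.

/* Toy simulation of the new truncate/invalidate loop shape: re-read the
 * index from each returned entry, break once it passes 'end', and bump
 * the search index only after the whole batch is done.  lookup_batch()
 * and all sizes are invented for illustration.
 */
#include <stdio.h>
#include <stddef.h>

#define BATCH 4				/* stands in for PAGEVEC_SIZE */

/* a sparse, sorted set of "page indices" present in the mapping */
static const unsigned long present[] = { 0, 1, 5, 6, 7, 30, 31, 90 };
static const size_t npresent = sizeof(present) / sizeof(present[0]);

/* return up to 'max' indices >= start, like pagevec_lookup() */
static size_t lookup_batch(unsigned long start, unsigned long *out, size_t max)
{
	size_t n = 0;

	for (size_t i = 0; i < npresent && n < max; i++)
		if (present[i] >= start)
			out[n++] = present[i];
	return n;
}

int main(void)
{
	unsigned long start = 1, end = 31;	/* inclusive range */
	unsigned long batch[BATCH];
	unsigned long index = start;
	size_t n;

	while (index <= end &&
	       (n = lookup_batch(index, batch,
			(end - index < BATCH - 1 ? end - index : BATCH - 1) + 1))) {
		for (size_t i = 0; i < n; i++) {
			index = batch[i];	/* re-read, like page->index */
			if (index > end)
				break;
			printf("process index %lu\n", index);
		}
		index++;			/* only after the whole batch */
	}
	return 0;
}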
@@ -333,35 +331,34 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
333 pgoff_t start, pgoff_t end) 331 pgoff_t start, pgoff_t end)
334{ 332{
335 struct pagevec pvec; 333 struct pagevec pvec;
336 pgoff_t next = start; 334 pgoff_t index = start;
337 unsigned long ret; 335 unsigned long ret;
338 unsigned long count = 0; 336 unsigned long count = 0;
339 int i; 337 int i;
340 338
339 /*
340 * Note: this function may get called on a shmem/tmpfs mapping:
341 * pagevec_lookup() might then return 0 prematurely (because it
342 * got a gangful of swap entries); but it's hardly worth worrying
343 * about - it can rarely have anything to free from such a mapping
344 * (most pages are dirty), and already skips over any difficulties.
345 */
346
341 pagevec_init(&pvec, 0); 347 pagevec_init(&pvec, 0);
342 while (next <= end && 348 while (index <= end && pagevec_lookup(&pvec, mapping, index,
343 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 349 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
344 mem_cgroup_uncharge_start(); 350 mem_cgroup_uncharge_start();
345 for (i = 0; i < pagevec_count(&pvec); i++) { 351 for (i = 0; i < pagevec_count(&pvec); i++) {
346 struct page *page = pvec.pages[i]; 352 struct page *page = pvec.pages[i];
347 pgoff_t index;
348 int lock_failed;
349 353
350 lock_failed = !trylock_page(page); 354 /* We rely upon deletion not changing page->index */
351
352 /*
353 * We really shouldn't be looking at the ->index of an
354 * unlocked page. But we're not allowed to lock these
355 * pages. So we rely upon nobody altering the ->index
356 * of this (pinned-by-us) page.
357 */
358 index = page->index; 355 index = page->index;
359 if (index > next) 356 if (index > end)
360 next = index; 357 break;
361 next++;
362 if (lock_failed)
363 continue;
364 358
359 if (!trylock_page(page))
360 continue;
361 WARN_ON(page->index != index);
365 ret = invalidate_inode_page(page); 362 ret = invalidate_inode_page(page);
366 unlock_page(page); 363 unlock_page(page);
367 /* 364 /*
@@ -371,12 +368,11 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
371 if (!ret) 368 if (!ret)
372 deactivate_page(page); 369 deactivate_page(page);
373 count += ret; 370 count += ret;
374 if (next > end)
375 break;
376 } 371 }
377 pagevec_release(&pvec); 372 pagevec_release(&pvec);
378 mem_cgroup_uncharge_end(); 373 mem_cgroup_uncharge_end();
379 cond_resched(); 374 cond_resched();
375 index++;
380 } 376 }
381 return count; 377 return count;
382} 378}
@@ -442,37 +438,32 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
442 pgoff_t start, pgoff_t end) 438 pgoff_t start, pgoff_t end)
443{ 439{
444 struct pagevec pvec; 440 struct pagevec pvec;
445 pgoff_t next; 441 pgoff_t index;
446 int i; 442 int i;
447 int ret = 0; 443 int ret = 0;
448 int ret2 = 0; 444 int ret2 = 0;
449 int did_range_unmap = 0; 445 int did_range_unmap = 0;
450 int wrapped = 0;
451 446
452 cleancache_flush_inode(mapping); 447 cleancache_flush_inode(mapping);
453 pagevec_init(&pvec, 0); 448 pagevec_init(&pvec, 0);
454 next = start; 449 index = start;
455 while (next <= end && !wrapped && 450 while (index <= end && pagevec_lookup(&pvec, mapping, index,
456 pagevec_lookup(&pvec, mapping, next, 451 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
457 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
458 mem_cgroup_uncharge_start(); 452 mem_cgroup_uncharge_start();
459 for (i = 0; i < pagevec_count(&pvec); i++) { 453 for (i = 0; i < pagevec_count(&pvec); i++) {
460 struct page *page = pvec.pages[i]; 454 struct page *page = pvec.pages[i];
461 pgoff_t page_index; 455
456 /* We rely upon deletion not changing page->index */
457 index = page->index;
458 if (index > end)
459 break;
462 460
463 lock_page(page); 461 lock_page(page);
462 WARN_ON(page->index != index);
464 if (page->mapping != mapping) { 463 if (page->mapping != mapping) {
465 unlock_page(page); 464 unlock_page(page);
466 continue; 465 continue;
467 } 466 }
468 page_index = page->index;
469 next = page_index + 1;
470 if (next == 0)
471 wrapped = 1;
472 if (page_index > end) {
473 unlock_page(page);
474 break;
475 }
476 wait_on_page_writeback(page); 467 wait_on_page_writeback(page);
477 if (page_mapped(page)) { 468 if (page_mapped(page)) {
478 if (!did_range_unmap) { 469 if (!did_range_unmap) {
@@ -480,9 +471,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
480 * Zap the rest of the file in one hit. 471 * Zap the rest of the file in one hit.
481 */ 472 */
482 unmap_mapping_range(mapping, 473 unmap_mapping_range(mapping,
483 (loff_t)page_index<<PAGE_CACHE_SHIFT, 474 (loff_t)index << PAGE_CACHE_SHIFT,
484 (loff_t)(end - page_index + 1) 475 (loff_t)(1 + end - index)
485 << PAGE_CACHE_SHIFT, 476 << PAGE_CACHE_SHIFT,
486 0); 477 0);
487 did_range_unmap = 1; 478 did_range_unmap = 1;
488 } else { 479 } else {
@@ -490,8 +481,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
490 * Just zap this page 481 * Just zap this page
491 */ 482 */
492 unmap_mapping_range(mapping, 483 unmap_mapping_range(mapping,
493 (loff_t)page_index<<PAGE_CACHE_SHIFT, 484 (loff_t)index << PAGE_CACHE_SHIFT,
494 PAGE_CACHE_SIZE, 0); 485 PAGE_CACHE_SIZE, 0);
495 } 486 }
496 } 487 }
497 BUG_ON(page_mapped(page)); 488 BUG_ON(page_mapped(page));
@@ -507,6 +498,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
507 pagevec_release(&pvec); 498 pagevec_release(&pvec);
508 mem_cgroup_uncharge_end(); 499 mem_cgroup_uncharge_end();
509 cond_resched(); 500 cond_resched();
501 index++;
510 } 502 }
511 cleancache_flush_inode(mapping); 503 cleancache_flush_inode(mapping);
512 return ret; 504 return ret;
@@ -531,8 +523,8 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
531/** 523/**
532 * truncate_pagecache - unmap and remove pagecache that has been truncated 524 * truncate_pagecache - unmap and remove pagecache that has been truncated
533 * @inode: inode 525 * @inode: inode
534 * @old: old file offset 526 * @oldsize: old file size
535 * @new: new file offset 527 * @newsize: new file size
536 * 528 *
537 * inode's new i_size must already be written before truncate_pagecache 529 * inode's new i_size must already be written before truncate_pagecache
538 * is called. 530 * is called.
@@ -544,9 +536,10 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
544 * situations such as writepage being called for a page that has already 536 * situations such as writepage being called for a page that has already
545 * had its underlying blocks deallocated. 537 * had its underlying blocks deallocated.
546 */ 538 */
547void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) 539void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize)
548{ 540{
549 struct address_space *mapping = inode->i_mapping; 541 struct address_space *mapping = inode->i_mapping;
542 loff_t holebegin = round_up(newsize, PAGE_SIZE);
550 543
551 /* 544 /*
552 * unmap_mapping_range is called twice, first simply for 545 * unmap_mapping_range is called twice, first simply for
@@ -557,9 +550,9 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
557 * truncate_inode_pages finishes, hence the second 550 * truncate_inode_pages finishes, hence the second
558 * unmap_mapping_range call must be made for correctness. 551 * unmap_mapping_range call must be made for correctness.
559 */ 552 */
560 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); 553 unmap_mapping_range(mapping, holebegin, 0, 1);
561 truncate_inode_pages(mapping, new); 554 truncate_inode_pages(mapping, newsize);
562 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); 555 unmap_mapping_range(mapping, holebegin, 0, 1);
563} 556}
564EXPORT_SYMBOL(truncate_pagecache); 557EXPORT_SYMBOL(truncate_pagecache);
565 558
@@ -589,29 +582,31 @@ EXPORT_SYMBOL(truncate_setsize);
589/** 582/**
590 * vmtruncate - unmap mappings "freed" by truncate() syscall 583 * vmtruncate - unmap mappings "freed" by truncate() syscall
591 * @inode: inode of the file used 584 * @inode: inode of the file used
592 * @offset: file offset to start truncating 585 * @newsize: file offset to start truncating
593 * 586 *
594 * This function is deprecated and truncate_setsize or truncate_pagecache 587 * This function is deprecated and truncate_setsize or truncate_pagecache
595 * should be used instead, together with filesystem specific block truncation. 588 * should be used instead, together with filesystem specific block truncation.
596 */ 589 */
597int vmtruncate(struct inode *inode, loff_t offset) 590int vmtruncate(struct inode *inode, loff_t newsize)
598{ 591{
599 int error; 592 int error;
600 593
601 error = inode_newsize_ok(inode, offset); 594 error = inode_newsize_ok(inode, newsize);
602 if (error) 595 if (error)
603 return error; 596 return error;
604 597
605 truncate_setsize(inode, offset); 598 truncate_setsize(inode, newsize);
606 if (inode->i_op->truncate) 599 if (inode->i_op->truncate)
607 inode->i_op->truncate(inode); 600 inode->i_op->truncate(inode);
608 return 0; 601 return 0;
609} 602}
610EXPORT_SYMBOL(vmtruncate); 603EXPORT_SYMBOL(vmtruncate);
611 604
612int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) 605int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
613{ 606{
614 struct address_space *mapping = inode->i_mapping; 607 struct address_space *mapping = inode->i_mapping;
608 loff_t holebegin = round_up(lstart, PAGE_SIZE);
609 loff_t holelen = 1 + lend - holebegin;
615 610
616 /* 611 /*
617 * If the underlying filesystem is not going to provide 612 * If the underlying filesystem is not going to provide
@@ -622,12 +617,11 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
622 return -ENOSYS; 617 return -ENOSYS;
623 618
624 mutex_lock(&inode->i_mutex); 619 mutex_lock(&inode->i_mutex);
625 down_write(&inode->i_alloc_sem); 620 inode_dio_wait(inode);
626 unmap_mapping_range(mapping, offset, (end - offset), 1); 621 unmap_mapping_range(mapping, holebegin, holelen, 1);
627 inode->i_op->truncate_range(inode, offset, end); 622 inode->i_op->truncate_range(inode, lstart, lend);
628 /* unmap again to remove racily COWed private pages */ 623 /* unmap again to remove racily COWed private pages */
629 unmap_mapping_range(mapping, offset, (end - offset), 1); 624 unmap_mapping_range(mapping, holebegin, holelen, 1);
630 up_write(&inode->i_alloc_sem);
631 mutex_unlock(&inode->i_mutex); 625 mutex_unlock(&inode->i_mutex);
632 626
633 return 0; 627 return 0;
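Both truncate_pagecache() and vmtruncate_range() above now round the start of the hole up to a page boundary, and the range variant derives the hole length as 1 + lend - holebegin before calling unmap_mapping_range(). The short standalone C sketch below just evaluates that arithmetic for one example range, assuming a 4096-byte page size for illustration.

/* Check the hole arithmetic used above: round the start of the hole up
 * to a page boundary, and derive the byte length of an inclusive
 * [lstart, lend] range from that rounded start.  PAGE_SIZE is assumed
 * to be 4096 here purely for illustration.
 */
#include <stdio.h>

#define PAGE_SIZE 4096L

static long round_up_page(long x)
{
	return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

int main(void)
{
	long lstart = 5000, lend = 20479;	/* inclusive byte range */
	long holebegin = round_up_page(lstart);	/* 8192 */
	long holelen = 1 + lend - holebegin;	/* 12288 = 3 pages */

	printf("holebegin=%ld holelen=%ld\n", holebegin, holelen);
	return 0;
}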
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1d34d75366a..3a65d6f7422 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -26,7 +26,7 @@
26#include <linux/rcupdate.h> 26#include <linux/rcupdate.h>
27#include <linux/pfn.h> 27#include <linux/pfn.h>
28#include <linux/kmemleak.h> 28#include <linux/kmemleak.h>
29#include <asm/atomic.h> 29#include <linux/atomic.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h> 32#include <asm/shmparam.h>
@@ -452,13 +452,6 @@ overflow:
452 return ERR_PTR(-EBUSY); 452 return ERR_PTR(-EBUSY);
453} 453}
454 454
455static void rcu_free_va(struct rcu_head *head)
456{
457 struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
458
459 kfree(va);
460}
461
462static void __free_vmap_area(struct vmap_area *va) 455static void __free_vmap_area(struct vmap_area *va)
463{ 456{
464 BUG_ON(RB_EMPTY_NODE(&va->rb_node)); 457 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
@@ -491,7 +484,7 @@ static void __free_vmap_area(struct vmap_area *va)
491 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) 484 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
492 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); 485 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
493 486
494 call_rcu(&va->rcu_head, rcu_free_va); 487 kfree_rcu(va, rcu_head);
495} 488}
496 489
497/* 490/*
@@ -732,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr)
732#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) 725#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
733#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ 726#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
734#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ 727#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
735#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ 728#define VMAP_BBMAP_BITS \
736 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ 729 VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
737 VMALLOC_PAGES / NR_CPUS / 16)) 730 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
731 VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
738 732
739#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) 733#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
740 734
@@ -837,13 +831,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
837 return vb; 831 return vb;
838} 832}
839 833
840static void rcu_free_vb(struct rcu_head *head)
841{
842 struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
843
844 kfree(vb);
845}
846
847static void free_vmap_block(struct vmap_block *vb) 834static void free_vmap_block(struct vmap_block *vb)
848{ 835{
849 struct vmap_block *tmp; 836 struct vmap_block *tmp;
@@ -856,7 +843,7 @@ static void free_vmap_block(struct vmap_block *vb)
856 BUG_ON(tmp != vb); 843 BUG_ON(tmp != vb);
857 844
858 free_vmap_area_noflush(vb->va); 845 free_vmap_area_noflush(vb->va);
859 call_rcu(&vb->rcu_head, rcu_free_vb); 846 kfree_rcu(vb, rcu_head);
860} 847}
861 848
862static void purge_fragmented_blocks(int cpu) 849static void purge_fragmented_blocks(int cpu)
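The two hunks above drop the hand-written RCU callbacks (rcu_free_va(), rcu_free_vb()) in favour of kfree_rcu(), which frees an object after a grace period without a dedicated callback, provided the object is released with kfree() and embeds a struct rcu_head. Below is a minimal kernel-style before/after sketch of that conversion; struct foo and its release functions are hypothetical, and the fragment is illustrative rather than a standalone program.

/* Kernel-style sketch (not a standalone program): replacing an
 * open-coded call_rcu() callback that only does kfree() with
 * kfree_rcu().  'struct foo' and the release functions are hypothetical.
 */
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int payload;
	struct rcu_head rcu_head;	/* needed by both variants */
};

/* before: a callback whose only job is container_of() + kfree() */
static void foo_rcu_free(struct rcu_head *head)
{
	struct foo *f = container_of(head, struct foo, rcu_head);

	kfree(f);
}

static void release_foo_old(struct foo *f)
{
	call_rcu(&f->rcu_head, foo_rcu_free);
}

/* after: kfree_rcu() takes the object and the rcu_head member name */
static void release_foo(struct foo *f)
{
	kfree_rcu(f, rcu_head);
}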
@@ -1266,18 +1253,22 @@ EXPORT_SYMBOL_GPL(map_vm_area);
1266DEFINE_RWLOCK(vmlist_lock); 1253DEFINE_RWLOCK(vmlist_lock);
1267struct vm_struct *vmlist; 1254struct vm_struct *vmlist;
1268 1255
1269static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1256static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1270 unsigned long flags, void *caller) 1257 unsigned long flags, void *caller)
1271{ 1258{
1272 struct vm_struct *tmp, **p;
1273
1274 vm->flags = flags; 1259 vm->flags = flags;
1275 vm->addr = (void *)va->va_start; 1260 vm->addr = (void *)va->va_start;
1276 vm->size = va->va_end - va->va_start; 1261 vm->size = va->va_end - va->va_start;
1277 vm->caller = caller; 1262 vm->caller = caller;
1278 va->private = vm; 1263 va->private = vm;
1279 va->flags |= VM_VM_AREA; 1264 va->flags |= VM_VM_AREA;
1265}
1266
1267static void insert_vmalloc_vmlist(struct vm_struct *vm)
1268{
1269 struct vm_struct *tmp, **p;
1280 1270
1271 vm->flags &= ~VM_UNLIST;
1281 write_lock(&vmlist_lock); 1272 write_lock(&vmlist_lock);
1282 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { 1273 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1283 if (tmp->addr >= vm->addr) 1274 if (tmp->addr >= vm->addr)
@@ -1288,6 +1279,13 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1288 write_unlock(&vmlist_lock); 1279 write_unlock(&vmlist_lock);
1289} 1280}
1290 1281
1282static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1283 unsigned long flags, void *caller)
1284{
1285 setup_vmalloc_vm(vm, va, flags, caller);
1286 insert_vmalloc_vmlist(vm);
1287}
1288
1291static struct vm_struct *__get_vm_area_node(unsigned long size, 1289static struct vm_struct *__get_vm_area_node(unsigned long size,
1292 unsigned long align, unsigned long flags, unsigned long start, 1290 unsigned long align, unsigned long flags, unsigned long start,
1293 unsigned long end, int node, gfp_t gfp_mask, void *caller) 1291 unsigned long end, int node, gfp_t gfp_mask, void *caller)
@@ -1326,7 +1324,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1326 return NULL; 1324 return NULL;
1327 } 1325 }
1328 1326
1329 insert_vmalloc_vm(area, va, flags, caller); 1327 /*
1328 * When this function is called from __vmalloc_node_range,
1329 * we do not add the vm_struct to vmlist here, to avoid
1330 * accessing uninitialized members of vm_struct such as the
1331 * pages and nr_pages fields; they will be set later.
1332 * The VM_UNLIST flag distinguishes such areas from the rest.
1333 */
1334 if (flags & VM_UNLIST)
1335 setup_vmalloc_vm(area, va, flags, caller);
1336 else
1337 insert_vmalloc_vm(area, va, flags, caller);
1338
1330 return area; 1339 return area;
1331} 1340}
1332 1341
@@ -1394,17 +1403,20 @@ struct vm_struct *remove_vm_area(const void *addr)
1394 va = find_vmap_area((unsigned long)addr); 1403 va = find_vmap_area((unsigned long)addr);
1395 if (va && va->flags & VM_VM_AREA) { 1404 if (va && va->flags & VM_VM_AREA) {
1396 struct vm_struct *vm = va->private; 1405 struct vm_struct *vm = va->private;
1397 struct vm_struct *tmp, **p; 1406
1398 /* 1407 if (!(vm->flags & VM_UNLIST)) {
1399 * remove from list and disallow access to this vm_struct 1408 struct vm_struct *tmp, **p;
1400 * before unmap. (address range confliction is maintained by 1409 /*
1401 * vmap.) 1410 * remove from list and disallow access to
1402 */ 1411 * this vm_struct before unmap. (address range
1403 write_lock(&vmlist_lock); 1412 * confliction is maintained by vmap.)
1404 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) 1413 */
1405 ; 1414 write_lock(&vmlist_lock);
1406 *p = tmp->next; 1415 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1407 write_unlock(&vmlist_lock); 1416 ;
1417 *p = tmp->next;
1418 write_unlock(&vmlist_lock);
1419 }
1408 1420
1409 vmap_debug_free_range(va->va_start, va->va_end); 1421 vmap_debug_free_range(va->va_start, va->va_end);
1410 free_unmap_vmap_area(va); 1422 free_unmap_vmap_area(va);
@@ -1615,13 +1627,21 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1615 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1627 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1616 return NULL; 1628 return NULL;
1617 1629
1618 area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, 1630 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
1619 gfp_mask, caller); 1631 start, end, node, gfp_mask, caller);
1620 1632
1621 if (!area) 1633 if (!area)
1622 return NULL; 1634 return NULL;
1623 1635
1624 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1636 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1637 if (!addr)
1638 return NULL;
1639
1640 /*
1641 * In this function, the newly allocated vm_struct was not added
1642 * to vmlist in __get_vm_area_node(), so it is added here.
1643 */
1644 insert_vmalloc_vmlist(area);
1625 1645
1626 /* 1646 /*
1627 * A ref_count = 3 is needed because the vm_struct and vmap_area 1647 * A ref_count = 3 is needed because the vm_struct and vmap_area
@@ -2153,6 +2173,14 @@ struct vm_struct *alloc_vm_area(size_t size)
2153 return NULL; 2173 return NULL;
2154 } 2174 }
2155 2175
2176 /*
2177 * If the allocated address space is passed to a hypercall
2178 * before being used then we cannot rely on a page fault to
2179 * trigger an update of the page tables. So sync all the page
2180 * tables here.
2181 */
2182 vmalloc_sync_all();
2183
2156 return area; 2184 return area;
2157} 2185}
2158EXPORT_SYMBOL_GPL(alloc_vm_area); 2186EXPORT_SYMBOL_GPL(alloc_vm_area);
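Taken together, the VM_UNLIST changes above implement a publish-after-init discipline: the vm_struct is fully set up first (setup_vmalloc_vm(), then the pages and nr_pages fields in __vmalloc_area_node()) and only afterwards linked onto the globally visible vmlist, while remove_vm_area() skips the list removal for areas that were never published. The standalone C sketch below shows the same discipline for a hypothetical mutex-protected list; all names are invented.

/* Toy version of the publish-after-init pattern behind VM_UNLIST:
 * set every field of the object first, and only then link it onto the
 * globally visible list under the lock.  All names are invented.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct area {
	void *addr;
	size_t nr_pages;
	struct area *next;
};

static struct area *area_list;				/* like vmlist */
static pthread_mutex_t area_lock = PTHREAD_MUTEX_INITIALIZER;

/* phase 1: allocate and fully initialize, nothing global touched yet */
static struct area *setup_area(size_t nr_pages)
{
	struct area *a = malloc(sizeof(*a));

	if (!a)
		return NULL;
	a->nr_pages = nr_pages;
	a->addr = calloc(nr_pages, 4096);		/* "pages" ready */
	a->next = NULL;
	return a;
}

/* phase 2: publish only once every field above is valid */
static void publish_area(struct area *a)
{
	pthread_mutex_lock(&area_lock);
	a->next = area_list;
	area_list = a;
	pthread_mutex_unlock(&area_lock);
}

int main(void)
{
	struct area *a = setup_area(4);

	if (!a || !a->addr)
		return 1;
	publish_area(a);
	printf("published area with %zu pages\n", area_list->nr_pages);
	return 0;
}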
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d036e59d302..b55699cd906 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -95,8 +95,6 @@ struct scan_control {
95 /* Can pages be swapped as part of reclaim? */ 95 /* Can pages be swapped as part of reclaim? */
96 int may_swap; 96 int may_swap;
97 97
98 int swappiness;
99
100 int order; 98 int order;
101 99
102 /* 100 /*
@@ -173,7 +171,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
173 struct scan_control *sc, enum lru_list lru) 171 struct scan_control *sc, enum lru_list lru)
174{ 172{
175 if (!scanning_global_lru(sc)) 173 if (!scanning_global_lru(sc))
176 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); 174 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
175 zone_to_nid(zone), zone_idx(zone), BIT(lru));
177 176
178 return zone_page_state(zone, NR_LRU_BASE + lru); 177 return zone_page_state(zone, NR_LRU_BASE + lru);
179} 178}
@@ -250,49 +249,90 @@ unsigned long shrink_slab(struct shrink_control *shrink,
250 unsigned long long delta; 249 unsigned long long delta;
251 unsigned long total_scan; 250 unsigned long total_scan;
252 unsigned long max_pass; 251 unsigned long max_pass;
252 int shrink_ret = 0;
253 long nr;
254 long new_nr;
255 long batch_size = shrinker->batch ? shrinker->batch
256 : SHRINK_BATCH;
257
258 /*
259 * copy the current shrinker scan count into a local variable
260 * and zero it so that other concurrent shrinker invocations
261 * don't also do this scanning work.
262 */
263 do {
264 nr = shrinker->nr;
265 } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
253 266
267 total_scan = nr;
254 max_pass = do_shrinker_shrink(shrinker, shrink, 0); 268 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
255 delta = (4 * nr_pages_scanned) / shrinker->seeks; 269 delta = (4 * nr_pages_scanned) / shrinker->seeks;
256 delta *= max_pass; 270 delta *= max_pass;
257 do_div(delta, lru_pages + 1); 271 do_div(delta, lru_pages + 1);
258 shrinker->nr += delta; 272 total_scan += delta;
259 if (shrinker->nr < 0) { 273 if (total_scan < 0) {
260 printk(KERN_ERR "shrink_slab: %pF negative objects to " 274 printk(KERN_ERR "shrink_slab: %pF negative objects to "
261 "delete nr=%ld\n", 275 "delete nr=%ld\n",
262 shrinker->shrink, shrinker->nr); 276 shrinker->shrink, total_scan);
263 shrinker->nr = max_pass; 277 total_scan = max_pass;
264 } 278 }
265 279
266 /* 280 /*
281 * We need to avoid excessive windup on filesystem shrinkers
282 * due to large numbers of GFP_NOFS allocations causing the
283 * shrinkers to return -1 all the time. This results in a large
284 * nr being built up, so when a shrink that can do some work
285 * comes along, it empties the entire cache due to nr >>>
286 * max_pass. This is bad for sustaining a working set in
287 * memory.
288 *
289 * Hence only allow the shrinker to scan the entire cache when
290 * a large delta change is calculated directly.
291 */
292 if (delta < max_pass / 4)
293 total_scan = min(total_scan, max_pass / 2);
294
295 /*
267 * Avoid risking looping forever due to too large nr value: 296 * Avoid risking looping forever due to too large nr value:
268 * never try to free more than twice the estimate number of 297 * never try to free more than twice the estimate number of
269 * freeable entries. 298 * freeable entries.
270 */ 299 */
271 if (shrinker->nr > max_pass * 2) 300 if (total_scan > max_pass * 2)
272 shrinker->nr = max_pass * 2; 301 total_scan = max_pass * 2;
273 302
274 total_scan = shrinker->nr; 303 trace_mm_shrink_slab_start(shrinker, shrink, nr,
275 shrinker->nr = 0; 304 nr_pages_scanned, lru_pages,
305 max_pass, delta, total_scan);
276 306
277 while (total_scan >= SHRINK_BATCH) { 307 while (total_scan >= batch_size) {
278 long this_scan = SHRINK_BATCH;
279 int shrink_ret;
280 int nr_before; 308 int nr_before;
281 309
282 nr_before = do_shrinker_shrink(shrinker, shrink, 0); 310 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
283 shrink_ret = do_shrinker_shrink(shrinker, shrink, 311 shrink_ret = do_shrinker_shrink(shrinker, shrink,
284 this_scan); 312 batch_size);
285 if (shrink_ret == -1) 313 if (shrink_ret == -1)
286 break; 314 break;
287 if (shrink_ret < nr_before) 315 if (shrink_ret < nr_before)
288 ret += nr_before - shrink_ret; 316 ret += nr_before - shrink_ret;
289 count_vm_events(SLABS_SCANNED, this_scan); 317 count_vm_events(SLABS_SCANNED, batch_size);
290 total_scan -= this_scan; 318 total_scan -= batch_size;
291 319
292 cond_resched(); 320 cond_resched();
293 } 321 }
294 322
295 shrinker->nr += total_scan; 323 /*
324 * move the unused scan count back into the shrinker in a
325 * manner that handles concurrent updates. If we exhausted the
326 * scan, there is no need to do an update.
327 */
328 do {
329 nr = shrinker->nr;
330 new_nr = total_scan + nr;
331 if (total_scan <= 0)
332 break;
333 } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
334
335 trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
296 } 336 }
297 up_read(&shrinker_rwsem); 337 up_read(&shrinker_rwsem);
298out: 338out:
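The new shrink_slab() accounting claims the shrinker's pending count with a cmpxchg loop (read nr, atomically swap in zero), works on a private total_scan, and finally adds any unused budget back with a second cmpxchg loop, so concurrent callers neither lose nor double-count work. The standalone C11 sketch below reproduces that claim-and-return idiom with <stdatomic.h>; the counter name and the numbers are illustrative.

/* Toy version of the claim-and-return counter idiom in shrink_slab():
 * atomically take the whole pending count for yourself, consume part
 * of it, and atomically add the leftover back.  Names are illustrative.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_long pending;			/* like shrinker->nr */

static long claim_all(void)
{
	long nr = atomic_load(&pending);

	/* like: do { nr = shrinker->nr; } while (cmpxchg(...) != nr); */
	while (!atomic_compare_exchange_weak(&pending, &nr, 0))
		;
	return nr;
}

static void give_back(long leftover)
{
	if (leftover <= 0)
		return;
	/* like the closing cmpxchg loop: add leftover to current value */
	long nr = atomic_load(&pending);
	while (!atomic_compare_exchange_weak(&pending, &nr, nr + leftover))
		;
}

int main(void)
{
	atomic_store(&pending, 100);

	long total_scan = claim_all() + 30;	/* claimed work + new delta */
	long batch = 32;			/* like shrinker->batch */

	while (total_scan >= batch) {		/* do the actual scanning */
		printf("scan a batch of %ld\n", batch);
		total_scan -= batch;
	}
	give_back(total_scan);			/* return the unused budget */
	printf("leftover returned: %ld\n", atomic_load(&pending));
	return 0;
}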
@@ -1729,6 +1769,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1729 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); 1769 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1730} 1770}
1731 1771
1772static int vmscan_swappiness(struct scan_control *sc)
1773{
1774 if (scanning_global_lru(sc))
1775 return vm_swappiness;
1776 return mem_cgroup_swappiness(sc->mem_cgroup);
1777}
1778
1732/* 1779/*
1733 * Determine how aggressively the anon and file LRU lists should be 1780 * Determine how aggressively the anon and file LRU lists should be
1734 * scanned. The relative value of each set of LRU lists is determined 1781 * scanned. The relative value of each set of LRU lists is determined
@@ -1747,22 +1794,15 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1747 u64 fraction[2], denominator; 1794 u64 fraction[2], denominator;
1748 enum lru_list l; 1795 enum lru_list l;
1749 int noswap = 0; 1796 int noswap = 0;
1750 int force_scan = 0; 1797 bool force_scan = false;
1798 unsigned long nr_force_scan[2];
1751 1799
1752 1800 /* kswapd does zone balancing and needs to scan this zone */
1753 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1801 if (scanning_global_lru(sc) && current_is_kswapd())
1754 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1802 force_scan = true;
1755 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + 1803 /* memcg may have small limit and need to avoid priority drop */
1756 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 1804 if (!scanning_global_lru(sc))
1757 1805 force_scan = true;
1758 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
1759 /* kswapd does zone balancing and need to scan this zone */
1760 if (scanning_global_lru(sc) && current_is_kswapd())
1761 force_scan = 1;
1762 /* memcg may have small limit and need to avoid priority drop */
1763 if (!scanning_global_lru(sc))
1764 force_scan = 1;
1765 }
1766 1806
1767 /* If we have no swap space, do not bother scanning anon pages. */ 1807 /* If we have no swap space, do not bother scanning anon pages. */
1768 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1808 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1770,9 +1810,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1770 fraction[0] = 0; 1810 fraction[0] = 0;
1771 fraction[1] = 1; 1811 fraction[1] = 1;
1772 denominator = 1; 1812 denominator = 1;
1813 nr_force_scan[0] = 0;
1814 nr_force_scan[1] = SWAP_CLUSTER_MAX;
1773 goto out; 1815 goto out;
1774 } 1816 }
1775 1817
1818 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1819 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1820 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1821 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1822
1776 if (scanning_global_lru(sc)) { 1823 if (scanning_global_lru(sc)) {
1777 free = zone_page_state(zone, NR_FREE_PAGES); 1824 free = zone_page_state(zone, NR_FREE_PAGES);
1778 /* If we have very few page cache pages, 1825 /* If we have very few page cache pages,
@@ -1781,6 +1828,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1781 fraction[0] = 1; 1828 fraction[0] = 1;
1782 fraction[1] = 0; 1829 fraction[1] = 0;
1783 denominator = 1; 1830 denominator = 1;
1831 nr_force_scan[0] = SWAP_CLUSTER_MAX;
1832 nr_force_scan[1] = 0;
1784 goto out; 1833 goto out;
1785 } 1834 }
1786 } 1835 }
@@ -1789,8 +1838,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1789 * With swappiness at 100, anonymous and file have the same priority. 1838 * With swappiness at 100, anonymous and file have the same priority.
1790 * This scanning priority is essentially the inverse of IO cost. 1839 * This scanning priority is essentially the inverse of IO cost.
1791 */ 1840 */
1792 anon_prio = sc->swappiness; 1841 anon_prio = vmscan_swappiness(sc);
1793 file_prio = 200 - sc->swappiness; 1842 file_prio = 200 - vmscan_swappiness(sc);
1794 1843
1795 /* 1844 /*
1796 * OK, so we have swap space and a fair amount of page cache 1845 * OK, so we have swap space and a fair amount of page cache
@@ -1829,6 +1878,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1829 fraction[0] = ap; 1878 fraction[0] = ap;
1830 fraction[1] = fp; 1879 fraction[1] = fp;
1831 denominator = ap + fp + 1; 1880 denominator = ap + fp + 1;
1881 if (force_scan) {
1882 unsigned long scan = SWAP_CLUSTER_MAX;
1883 nr_force_scan[0] = div64_u64(scan * ap, denominator);
1884 nr_force_scan[1] = div64_u64(scan * fp, denominator);
1885 }
1832out: 1886out:
1833 for_each_evictable_lru(l) { 1887 for_each_evictable_lru(l) {
1834 int file = is_file_lru(l); 1888 int file = is_file_lru(l);
@@ -1849,12 +1903,8 @@ out:
1849 * memcg, priority drop can cause big latency. So, it's better 1903 * memcg, priority drop can cause big latency. So, it's better
1850 * to scan small amount. See may_noscan above. 1904 * to scan small amount. See may_noscan above.
1851 */ 1905 */
1852 if (!scan && force_scan) { 1906 if (!scan && force_scan)
1853 if (file) 1907 scan = nr_force_scan[file];
1854 scan = SWAP_CLUSTER_MAX;
1855 else if (!noswap)
1856 scan = SWAP_CLUSTER_MAX;
1857 }
1858 nr[l] = scan; 1908 nr[l] = scan;
1859 } 1909 }
1860} 1910}
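get_scan_count() above expresses the anon/file balance as two weights, fraction[0] = ap and fraction[1] = fp over denominator = ap + fp + 1, and the new nr_force_scan[] values are simply SWAP_CLUSTER_MAX split in the same proportion via div64_u64(). The small standalone C sketch below evaluates that split for one made-up pair of weights.

/* Check how the forced-scan amounts are split: SWAP_CLUSTER_MAX is
 * divided between anon and file in the same ap:fp proportion used for
 * the ordinary scan targets.  The weights below are made up.
 */
#include <stdio.h>
#include <stdint.h>

#define SWAP_CLUSTER_MAX 32ULL

int main(void)
{
	uint64_t ap = 75, fp = 300;		/* illustrative weights */
	uint64_t denominator = ap + fp + 1;	/* as in get_scan_count() */

	/* like div64_u64(scan * ap, denominator) and the fp counterpart */
	uint64_t force_anon = SWAP_CLUSTER_MAX * ap / denominator;
	uint64_t force_file = SWAP_CLUSTER_MAX * fp / denominator;

	printf("force-scan anon=%llu file=%llu\n",
	       (unsigned long long)force_anon,
	       (unsigned long long)force_file);
	return 0;
}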
@@ -2179,7 +2229,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2179 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2229 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2180 .may_unmap = 1, 2230 .may_unmap = 1,
2181 .may_swap = 1, 2231 .may_swap = 1,
2182 .swappiness = vm_swappiness,
2183 .order = order, 2232 .order = order,
2184 .mem_cgroup = NULL, 2233 .mem_cgroup = NULL,
2185 .nodemask = nodemask, 2234 .nodemask = nodemask,
@@ -2203,7 +2252,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2203 2252
2204unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2253unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2205 gfp_t gfp_mask, bool noswap, 2254 gfp_t gfp_mask, bool noswap,
2206 unsigned int swappiness,
2207 struct zone *zone, 2255 struct zone *zone,
2208 unsigned long *nr_scanned) 2256 unsigned long *nr_scanned)
2209{ 2257{
@@ -2213,7 +2261,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2213 .may_writepage = !laptop_mode, 2261 .may_writepage = !laptop_mode,
2214 .may_unmap = 1, 2262 .may_unmap = 1,
2215 .may_swap = !noswap, 2263 .may_swap = !noswap,
2216 .swappiness = swappiness,
2217 .order = 0, 2264 .order = 0,
2218 .mem_cgroup = mem, 2265 .mem_cgroup = mem,
2219 }; 2266 };
@@ -2242,8 +2289,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2242 2289
2243unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 2290unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2244 gfp_t gfp_mask, 2291 gfp_t gfp_mask,
2245 bool noswap, 2292 bool noswap)
2246 unsigned int swappiness)
2247{ 2293{
2248 struct zonelist *zonelist; 2294 struct zonelist *zonelist;
2249 unsigned long nr_reclaimed; 2295 unsigned long nr_reclaimed;
@@ -2253,7 +2299,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2253 .may_unmap = 1, 2299 .may_unmap = 1,
2254 .may_swap = !noswap, 2300 .may_swap = !noswap,
2255 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2301 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2256 .swappiness = swappiness,
2257 .order = 0, 2302 .order = 0,
2258 .mem_cgroup = mem_cont, 2303 .mem_cgroup = mem_cont,
2259 .nodemask = NULL, /* we don't care the placement */ 2304 .nodemask = NULL, /* we don't care the placement */
@@ -2404,7 +2449,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2404 * we want to put equal scanning pressure on each zone. 2449 * we want to put equal scanning pressure on each zone.
2405 */ 2450 */
2406 .nr_to_reclaim = ULONG_MAX, 2451 .nr_to_reclaim = ULONG_MAX,
2407 .swappiness = vm_swappiness,
2408 .order = order, 2452 .order = order,
2409 .mem_cgroup = NULL, 2453 .mem_cgroup = NULL,
2410 }; 2454 };
@@ -2453,6 +2497,9 @@ loop_again:
2453 high_wmark_pages(zone), 0, 0)) { 2497 high_wmark_pages(zone), 0, 0)) {
2454 end_zone = i; 2498 end_zone = i;
2455 break; 2499 break;
2500 } else {
2501 /* If balanced, clear the congested flag */
2502 zone_clear_flag(zone, ZONE_CONGESTED);
2456 } 2503 }
2457 } 2504 }
2458 if (i < 0) 2505 if (i < 0)
@@ -2874,7 +2921,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2874 .may_writepage = 1, 2921 .may_writepage = 1,
2875 .nr_to_reclaim = nr_to_reclaim, 2922 .nr_to_reclaim = nr_to_reclaim,
2876 .hibernation_mode = 1, 2923 .hibernation_mode = 1,
2877 .swappiness = vm_swappiness,
2878 .order = 0, 2924 .order = 0,
2879 }; 2925 };
2880 struct shrink_control shrink = { 2926 struct shrink_control shrink = {
@@ -3061,7 +3107,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3061 .nr_to_reclaim = max_t(unsigned long, nr_pages, 3107 .nr_to_reclaim = max_t(unsigned long, nr_pages,
3062 SWAP_CLUSTER_MAX), 3108 SWAP_CLUSTER_MAX),
3063 .gfp_mask = gfp_mask, 3109 .gfp_mask = gfp_mask,
3064 .swappiness = vm_swappiness,
3065 .order = order, 3110 .order = order,
3066 }; 3111 };
3067 struct shrink_control shrink = { 3112 struct shrink_control shrink = {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c18b7694b..d52b13d28e8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -659,7 +659,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
659} 659}
660#endif 660#endif
661 661
662#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) 662#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
663#ifdef CONFIG_ZONE_DMA 663#ifdef CONFIG_ZONE_DMA
664#define TEXT_FOR_DMA(xx) xx "_dma", 664#define TEXT_FOR_DMA(xx) xx "_dma",
665#else 665#else
@@ -788,7 +788,7 @@ const char * const vmstat_text[] = {
788 788
789#endif /* CONFIG_VM_EVENTS_COUNTERS */ 789#endif /* CONFIG_VM_EVENTS_COUNTERS */
790}; 790};
791#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */ 791#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
792 792
793 793
794#ifdef CONFIG_PROC_FS 794#ifdef CONFIG_PROC_FS