Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             23
-rw-r--r--  mm/Makefile             1
-rw-r--r--  mm/backing-dev.c        4
-rw-r--r--  mm/cleancache.c       244
-rw-r--r--  mm/filemap.c           83
-rw-r--r--  mm/filemap_xip.c        4
-rw-r--r--  mm/fremap.c             6
-rw-r--r--  mm/huge_memory.c       25
-rw-r--r--  mm/hugetlb.c           20
-rw-r--r--  mm/init-mm.c            1
-rw-r--r--  mm/internal.h           4
-rw-r--r--  mm/kmemleak.c           7
-rw-r--r--  mm/ksm.c                7
-rw-r--r--  mm/memcontrol.c       377
-rw-r--r--  mm/memory-failure.c    21
-rw-r--r--  mm/memory.c           444
-rw-r--r--  mm/memory_hotplug.c    21
-rw-r--r--  mm/mempolicy.c        164
-rw-r--r--  mm/migrate.c           17
-rw-r--r--  mm/mlock.c              8
-rw-r--r--  mm/mmap.c             129
-rw-r--r--  mm/mremap.c             5
-rw-r--r--  mm/nobootmem.c         23
-rw-r--r--  mm/nommu.c            108
-rw-r--r--  mm/oom_kill.c          36
-rw-r--r--  mm/page_alloc.c       128
-rw-r--r--  mm/page_cgroup.c       28
-rw-r--r--  mm/percpu.c             6
-rw-r--r--  mm/prio_tree.c          1
-rw-r--r--  mm/readahead.c          2
-rw-r--r--  mm/rmap.c             183
-rw-r--r--  mm/shmem.c            334
-rw-r--r--  mm/slab.c               1
-rw-r--r--  mm/slub.c             169
-rw-r--r--  mm/swap.c              52
-rw-r--r--  mm/swapfile.c           6
-rw-r--r--  mm/truncate.c           6
-rw-r--r--  mm/util.c              24
-rw-r--r--  mm/vmalloc.c           15
-rw-r--r--  mm/vmscan.c           185
-rw-r--r--  mm/vmstat.c           264
41 files changed, 2057 insertions, 1129 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e9c0c61f2ddd..8ca47a5ee9c8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -347,3 +347,26 @@ config NEED_PER_CPU_KM
347 depends on !SMP 347 depends on !SMP
348 bool 348 bool
349 default y 349 default y
350
351config CLEANCACHE
352 bool "Enable cleancache driver to cache clean pages if tmem is present"
353 default n
354 help
355 Cleancache can be thought of as a page-granularity victim cache
356 for clean pages that the kernel's pageframe replacement algorithm
357 (PFRA) would like to keep around, but can't since there isn't enough
358 memory. So when the PFRA "evicts" a page, it first attempts to use
359 cleancache code to put the data contained in that page into
360 "transcendent memory", memory that is not directly accessible or
361 addressable by the kernel and is of unknown and possibly
362 time-varying size. And when a cleancache-enabled
363 filesystem wishes to access a page in a file on disk, it first
364 checks cleancache to see if it already contains it; if it does,
365 the page is copied into the kernel and a disk access is avoided.
366 When a transcendent memory driver is available (such as zcache or
367 Xen transcendent memory), a significant I/O reduction
368 may be achieved. When none is available, all cleancache calls
369 are reduced to a single pointer-compare-against-NULL resulting
370 in a negligible performance hit.
371
372 If unsure, say Y to enable cleancache
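
The "negligible performance hit" promised in the help text comes from header-side wrappers that guard every hook with the global enable flag (see the cleancache_enabled comment in mm/cleancache.c below). The following is an illustrative sketch of such a wrapper, inferred from the frontend added further down in this patch; it is not a hunk of the diff, and the real declarations belong in include/linux/cleancache.h.

/* Illustrative sketch only -- not part of the diff. */
#include <linux/mm.h>

extern int cleancache_enabled;
extern int __cleancache_get_page(struct page *page);

static inline int cleancache_get_page(struct page *page)
{
	int ret = -1;

	/* when no backend has registered, this is a single flag test */
	if (cleancache_enabled)
		ret = __cleancache_get_page(page);
	return ret;
}
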
diff --git a/mm/Makefile b/mm/Makefile
index 42a8326c3e3d..836e4163c1bf 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -49,3 +49,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
49obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 49obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
50obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 50obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
51obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 51obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
52obj-$(CONFIG_CLEANCACHE) += cleancache.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index befc87531e4f..f032e6e1e09a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -63,10 +63,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
63 unsigned long background_thresh; 63 unsigned long background_thresh;
64 unsigned long dirty_thresh; 64 unsigned long dirty_thresh;
65 unsigned long bdi_thresh; 65 unsigned long bdi_thresh;
66 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; 66 unsigned long nr_dirty, nr_io, nr_more_io;
67 struct inode *inode; 67 struct inode *inode;
68 68
69 nr_wb = nr_dirty = nr_io = nr_more_io = 0; 69 nr_dirty = nr_io = nr_more_io = 0;
70 spin_lock(&inode_wb_list_lock); 70 spin_lock(&inode_wb_list_lock);
71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
72 nr_dirty++; 72 nr_dirty++;
diff --git a/mm/cleancache.c b/mm/cleancache.c
new file mode 100644
index 000000000000..bcaae4c2a770
--- /dev/null
+++ b/mm/cleancache.c
@@ -0,0 +1,244 @@
1/*
2 * Cleancache frontend
3 *
4 * This code provides the generic "frontend" layer to call a matching
5 * "backend" driver implementation of cleancache. See
6 * Documentation/vm/cleancache.txt for more information.
7 *
8 * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
9 * Author: Dan Magenheimer
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2.
12 */
13
14#include <linux/module.h>
15#include <linux/fs.h>
16#include <linux/exportfs.h>
17#include <linux/mm.h>
18#include <linux/cleancache.h>
19
20/*
21 * This global enablement flag may be read thousands of times per second
22 * by cleancache_get/put/flush even on systems where cleancache_ops
23 * is not claimed (e.g. cleancache is config'ed on but remains
24 * disabled), so is preferred to the slower alternative: a function
25 * call that checks a non-global.
26 */
27int cleancache_enabled;
28EXPORT_SYMBOL(cleancache_enabled);
29
30/*
31 * cleancache_ops is set by cleancache_ops_register to contain the pointers
32 * to the cleancache "backend" implementation functions.
33 */
34static struct cleancache_ops cleancache_ops;
35
36/* useful stats available in /sys/kernel/mm/cleancache */
37static unsigned long cleancache_succ_gets;
38static unsigned long cleancache_failed_gets;
39static unsigned long cleancache_puts;
40static unsigned long cleancache_flushes;
41
42/*
43 * register operations for cleancache, returning previous thus allowing
44 * detection of multiple backends and possible nesting
45 */
46struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops)
47{
48 struct cleancache_ops old = cleancache_ops;
49
50 cleancache_ops = *ops;
51 cleancache_enabled = 1;
52 return old;
53}
54EXPORT_SYMBOL(cleancache_register_ops);
55
56/* Called by a cleancache-enabled filesystem at time of mount */
57void __cleancache_init_fs(struct super_block *sb)
58{
59 sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE);
60}
61EXPORT_SYMBOL(__cleancache_init_fs);
62
63/* Called by a cleancache-enabled clustered filesystem at time of mount */
64void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
65{
66 sb->cleancache_poolid =
67 (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE);
68}
69EXPORT_SYMBOL(__cleancache_init_shared_fs);
70
71/*
72 * If the filesystem uses exportable filehandles, use the filehandle as
73 * the key, else use the inode number.
74 */
75static int cleancache_get_key(struct inode *inode,
76 struct cleancache_filekey *key)
77{
78 int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
79 int len = 0, maxlen = CLEANCACHE_KEY_MAX;
80 struct super_block *sb = inode->i_sb;
81
82 key->u.ino = inode->i_ino;
83 if (sb->s_export_op != NULL) {
84 fhfn = sb->s_export_op->encode_fh;
85 if (fhfn) {
86 struct dentry d;
87 d.d_inode = inode;
88 len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
89 if (len <= 0 || len == 255)
90 return -1;
91 if (maxlen > CLEANCACHE_KEY_MAX)
92 return -1;
93 }
94 }
95 return 0;
96}
97
98/*
99 * "Get" data from cleancache associated with the poolid/inode/index
100 * that were specified when the data was put to cleancache and, if
101 * successful, use it to fill the specified page with data and return 0.
102 * The pageframe is unchanged and returns -1 if the get fails.
103 * Page must be locked by caller.
104 */
105int __cleancache_get_page(struct page *page)
106{
107 int ret = -1;
108 int pool_id;
109 struct cleancache_filekey key = { .u.key = { 0 } };
110
111 VM_BUG_ON(!PageLocked(page));
112 pool_id = page->mapping->host->i_sb->cleancache_poolid;
113 if (pool_id < 0)
114 goto out;
115
116 if (cleancache_get_key(page->mapping->host, &key) < 0)
117 goto out;
118
119 ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page);
120 if (ret == 0)
121 cleancache_succ_gets++;
122 else
123 cleancache_failed_gets++;
124out:
125 return ret;
126}
127EXPORT_SYMBOL(__cleancache_get_page);
128
129/*
130 * "Put" data from a page to cleancache and associate it with the
131 * (previously-obtained per-filesystem) poolid and the page's
132 * inode and page index. Page must be locked. Note that a put_page
133 * always "succeeds", though a subsequent get_page may succeed or fail.
134 */
135void __cleancache_put_page(struct page *page)
136{
137 int pool_id;
138 struct cleancache_filekey key = { .u.key = { 0 } };
139
140 VM_BUG_ON(!PageLocked(page));
141 pool_id = page->mapping->host->i_sb->cleancache_poolid;
142 if (pool_id >= 0 &&
143 cleancache_get_key(page->mapping->host, &key) >= 0) {
144 (*cleancache_ops.put_page)(pool_id, key, page->index, page);
145 cleancache_puts++;
146 }
147}
148EXPORT_SYMBOL(__cleancache_put_page);
149
150/*
151 * Flush any data from cleancache associated with the poolid and the
152 * page's inode and page index so that a subsequent "get" will fail.
153 */
154void __cleancache_flush_page(struct address_space *mapping, struct page *page)
155{
156 /* careful... page->mapping is NULL sometimes when this is called */
157 int pool_id = mapping->host->i_sb->cleancache_poolid;
158 struct cleancache_filekey key = { .u.key = { 0 } };
159
160 if (pool_id >= 0) {
161 VM_BUG_ON(!PageLocked(page));
162 if (cleancache_get_key(mapping->host, &key) >= 0) {
163 (*cleancache_ops.flush_page)(pool_id, key, page->index);
164 cleancache_flushes++;
165 }
166 }
167}
168EXPORT_SYMBOL(__cleancache_flush_page);
169
170/*
171 * Flush all data from cleancache associated with the poolid and the
172 * mapping's inode so that all subsequent gets to this poolid/inode
173 * will fail.
174 */
175void __cleancache_flush_inode(struct address_space *mapping)
176{
177 int pool_id = mapping->host->i_sb->cleancache_poolid;
178 struct cleancache_filekey key = { .u.key = { 0 } };
179
180 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
181 (*cleancache_ops.flush_inode)(pool_id, key);
182}
183EXPORT_SYMBOL(__cleancache_flush_inode);
184
185/*
186 * Called by any cleancache-enabled filesystem at time of unmount;
187 * note that pool_id is surrendered and may be returned by a subsequent
188 * cleancache_init_fs or cleancache_init_shared_fs
189 */
190void __cleancache_flush_fs(struct super_block *sb)
191{
192 if (sb->cleancache_poolid >= 0) {
193 int old_poolid = sb->cleancache_poolid;
194 sb->cleancache_poolid = -1;
195 (*cleancache_ops.flush_fs)(old_poolid);
196 }
197}
198EXPORT_SYMBOL(__cleancache_flush_fs);
199
200#ifdef CONFIG_SYSFS
201
202/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */
203
204#define CLEANCACHE_SYSFS_RO(_name) \
205 static ssize_t cleancache_##_name##_show(struct kobject *kobj, \
206 struct kobj_attribute *attr, char *buf) \
207 { \
208 return sprintf(buf, "%lu\n", cleancache_##_name); \
209 } \
210 static struct kobj_attribute cleancache_##_name##_attr = { \
211 .attr = { .name = __stringify(_name), .mode = 0444 }, \
212 .show = cleancache_##_name##_show, \
213 }
214
215CLEANCACHE_SYSFS_RO(succ_gets);
216CLEANCACHE_SYSFS_RO(failed_gets);
217CLEANCACHE_SYSFS_RO(puts);
218CLEANCACHE_SYSFS_RO(flushes);
219
220static struct attribute *cleancache_attrs[] = {
221 &cleancache_succ_gets_attr.attr,
222 &cleancache_failed_gets_attr.attr,
223 &cleancache_puts_attr.attr,
224 &cleancache_flushes_attr.attr,
225 NULL,
226};
227
228static struct attribute_group cleancache_attr_group = {
229 .attrs = cleancache_attrs,
230 .name = "cleancache",
231};
232
233#endif /* CONFIG_SYSFS */
234
235static int __init init_cleancache(void)
236{
237#ifdef CONFIG_SYSFS
238 int err;
239
240 err = sysfs_create_group(mm_kobj, &cleancache_attr_group);
241#endif /* CONFIG_SYSFS */
242 return 0;
243}
244module_init(init_cleancache)
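
For context, a backend claims these hooks by filling a struct cleancache_ops and handing it to cleancache_register_ops(). The skeleton below is a hypothetical sketch ("mybackend" and its callbacks are made-up names); the member names and signatures are inferred from the frontend calls above, and the authoritative definition lives in include/linux/cleancache.h.

/* Hypothetical backend skeleton -- a sketch, not part of the diff. */
#include <linux/module.h>
#include <linux/cleancache.h>

static int mybackend_init_fs(size_t pagesize)
{
	/* create a pool for pages of 'pagesize' bytes, return its pool id */
	return 0;
}

static int mybackend_get_page(int pool, struct cleancache_filekey key,
			      pgoff_t index, struct page *page)
{
	return -1;	/* miss: leave the pageframe untouched */
}

static void mybackend_put_page(int pool, struct cleancache_filekey key,
			       pgoff_t index, struct page *page)
{
	/* copy the page into transcendent memory; it may be dropped later */
}

static void mybackend_flush_page(int pool, struct cleancache_filekey key,
				 pgoff_t index)
{
	/* forget any stored copy so a later get will miss */
}

static struct cleancache_ops mybackend_ops = {
	.init_fs	= mybackend_init_fs,
	.get_page	= mybackend_get_page,
	.put_page	= mybackend_put_page,
	.flush_page	= mybackend_flush_page,
	/* .init_shared_fs, .flush_inode and .flush_fs omitted here; a real
	 * backend must provide them, since the frontend above calls them
	 * unconditionally once a pool id has been handed out */
};

static int __init mybackend_init(void)
{
	struct cleancache_ops old = cleancache_register_ops(&mybackend_ops);

	/* a non-NULL old.init_fs would mean another backend was already loaded */
	WARN_ON(old.init_fs != NULL);
	return 0;
}
module_init(mybackend_init);
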
diff --git a/mm/filemap.c b/mm/filemap.c
index c641edf553a9..bcdc393b6580 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 35#include <linux/memcontrol.h>
36#include <linux/mm_inline.h> /* for page_is_file_cache() */ 36#include <linux/mm_inline.h> /* for page_is_file_cache() */
37#include <linux/cleancache.h>
37#include "internal.h" 38#include "internal.h"
38 39
39/* 40/*
@@ -58,16 +59,16 @@
58/* 59/*
59 * Lock ordering: 60 * Lock ordering:
60 * 61 *
61 * ->i_mmap_lock (truncate_pagecache) 62 * ->i_mmap_mutex (truncate_pagecache)
62 * ->private_lock (__free_pte->__set_page_dirty_buffers) 63 * ->private_lock (__free_pte->__set_page_dirty_buffers)
63 * ->swap_lock (exclusive_swap_page, others) 64 * ->swap_lock (exclusive_swap_page, others)
64 * ->mapping->tree_lock 65 * ->mapping->tree_lock
65 * 66 *
66 * ->i_mutex 67 * ->i_mutex
67 * ->i_mmap_lock (truncate->unmap_mapping_range) 68 * ->i_mmap_mutex (truncate->unmap_mapping_range)
68 * 69 *
69 * ->mmap_sem 70 * ->mmap_sem
70 * ->i_mmap_lock 71 * ->i_mmap_mutex
71 * ->page_table_lock or pte_lock (various, mainly in memory.c) 72 * ->page_table_lock or pte_lock (various, mainly in memory.c)
72 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) 73 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
73 * 74 *
@@ -84,7 +85,7 @@
84 * sb_lock (fs/fs-writeback.c) 85 * sb_lock (fs/fs-writeback.c)
85 * ->mapping->tree_lock (__sync_single_inode) 86 * ->mapping->tree_lock (__sync_single_inode)
86 * 87 *
87 * ->i_mmap_lock 88 * ->i_mmap_mutex
88 * ->anon_vma.lock (vma_adjust) 89 * ->anon_vma.lock (vma_adjust)
89 * 90 *
90 * ->anon_vma.lock 91 * ->anon_vma.lock
@@ -106,7 +107,7 @@
106 * 107 *
107 * (code doesn't rely on that order, so you could switch it around) 108 * (code doesn't rely on that order, so you could switch it around)
108 * ->tasklist_lock (memory_failure, collect_procs_ao) 109 * ->tasklist_lock (memory_failure, collect_procs_ao)
109 * ->i_mmap_lock 110 * ->i_mmap_mutex
110 */ 111 */
111 112
112/* 113/*
@@ -118,6 +119,16 @@ void __delete_from_page_cache(struct page *page)
118{ 119{
119 struct address_space *mapping = page->mapping; 120 struct address_space *mapping = page->mapping;
120 121
122 /*
123 * if we're uptodate, flush out into the cleancache, otherwise
124 * invalidate any existing cleancache entries. We can't leave
125 * stale data around in the cleancache once our page is gone
126 */
127 if (PageUptodate(page) && PageMappedToDisk(page))
128 cleancache_put_page(page);
129 else
130 cleancache_flush_page(mapping, page);
131
121 radix_tree_delete(&mapping->page_tree, page->index); 132 radix_tree_delete(&mapping->page_tree, page->index);
122 page->mapping = NULL; 133 page->mapping = NULL;
123 mapping->nrpages--; 134 mapping->nrpages--;
@@ -562,6 +573,17 @@ void wait_on_page_bit(struct page *page, int bit_nr)
562} 573}
563EXPORT_SYMBOL(wait_on_page_bit); 574EXPORT_SYMBOL(wait_on_page_bit);
564 575
576int wait_on_page_bit_killable(struct page *page, int bit_nr)
577{
578 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
579
580 if (!test_bit(bit_nr, &page->flags))
581 return 0;
582
583 return __wait_on_bit(page_waitqueue(page), &wait,
584 sleep_on_page_killable, TASK_KILLABLE);
585}
586
565/** 587/**
566 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue 588 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
567 * @page: Page defining the wait queue of interest 589 * @page: Page defining the wait queue of interest
@@ -643,15 +665,32 @@ EXPORT_SYMBOL_GPL(__lock_page_killable);
643int __lock_page_or_retry(struct page *page, struct mm_struct *mm, 665int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
644 unsigned int flags) 666 unsigned int flags)
645{ 667{
646 if (!(flags & FAULT_FLAG_ALLOW_RETRY)) { 668 if (flags & FAULT_FLAG_ALLOW_RETRY) {
647 __lock_page(page); 669 /*
648 return 1; 670 * CAUTION! In this case, mmap_sem is not released
649 } else { 671 * even though return 0.
650 if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) { 672 */
651 up_read(&mm->mmap_sem); 673 if (flags & FAULT_FLAG_RETRY_NOWAIT)
674 return 0;
675
676 up_read(&mm->mmap_sem);
677 if (flags & FAULT_FLAG_KILLABLE)
678 wait_on_page_locked_killable(page);
679 else
652 wait_on_page_locked(page); 680 wait_on_page_locked(page);
653 }
654 return 0; 681 return 0;
682 } else {
683 if (flags & FAULT_FLAG_KILLABLE) {
684 int ret;
685
686 ret = __lock_page_killable(page);
687 if (ret) {
688 up_read(&mm->mmap_sem);
689 return 0;
690 }
691 } else
692 __lock_page(page);
693 return 1;
655 } 694 }
656} 695}
657 696
@@ -1528,15 +1567,17 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1528 /* If we don't want any read-ahead, don't bother */ 1567 /* If we don't want any read-ahead, don't bother */
1529 if (VM_RandomReadHint(vma)) 1568 if (VM_RandomReadHint(vma))
1530 return; 1569 return;
1570 if (!ra->ra_pages)
1571 return;
1531 1572
1532 if (VM_SequentialReadHint(vma) || 1573 if (VM_SequentialReadHint(vma)) {
1533 offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
1534 page_cache_sync_readahead(mapping, ra, file, offset, 1574 page_cache_sync_readahead(mapping, ra, file, offset,
1535 ra->ra_pages); 1575 ra->ra_pages);
1536 return; 1576 return;
1537 } 1577 }
1538 1578
1539 if (ra->mmap_miss < INT_MAX) 1579 /* Avoid banging the cache line if not needed */
1580 if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
1540 ra->mmap_miss++; 1581 ra->mmap_miss++;
1541 1582
1542 /* 1583 /*
@@ -1550,12 +1591,10 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1550 * mmap read-around 1591 * mmap read-around
1551 */ 1592 */
1552 ra_pages = max_sane_readahead(ra->ra_pages); 1593 ra_pages = max_sane_readahead(ra->ra_pages);
1553 if (ra_pages) { 1594 ra->start = max_t(long, 0, offset - ra_pages / 2);
1554 ra->start = max_t(long, 0, offset - ra_pages/2); 1595 ra->size = ra_pages;
1555 ra->size = ra_pages; 1596 ra->async_size = ra_pages / 4;
1556 ra->async_size = 0; 1597 ra_submit(ra, mapping, file);
1557 ra_submit(ra, mapping, file);
1558 }
1559} 1598}
1560 1599
1561/* 1600/*
@@ -1622,6 +1661,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1622 /* No page in the page cache at all */ 1661 /* No page in the page cache at all */
1623 do_sync_mmap_readahead(vma, ra, file, offset); 1662 do_sync_mmap_readahead(vma, ra, file, offset);
1624 count_vm_event(PGMAJFAULT); 1663 count_vm_event(PGMAJFAULT);
1664 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1625 ret = VM_FAULT_MAJOR; 1665 ret = VM_FAULT_MAJOR;
1626retry_find: 1666retry_find:
1627 page = find_get_page(mapping, offset); 1667 page = find_get_page(mapping, offset);
@@ -1660,7 +1700,6 @@ retry_find:
1660 return VM_FAULT_SIGBUS; 1700 return VM_FAULT_SIGBUS;
1661 } 1701 }
1662 1702
1663 ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
1664 vmf->page = page; 1703 vmf->page = page;
1665 return ret | VM_FAULT_LOCKED; 1704 return ret | VM_FAULT_LOCKED;
1666 1705
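
The killable wait added above is what lets __lock_page_or_retry() honour FAULT_FLAG_KILLABLE: on return 1 the page is locked and mmap_sem is still held; on return 0 mmap_sem has been dropped, except in the FAULT_FLAG_RETRY_NOWAIT case the comment calls out. The wait_on_page_locked_killable() helper it calls is presumably the pagemap.h-side wrapper sketched below, mirroring the existing wait_on_page_locked(); this sketch is an assumption, not a hunk of the diff.

/* Sketch of the header-side helper used by __lock_page_or_retry() above;
 * assumed to sit in include/linux/pagemap.h next to wait_on_page_locked(). */
static inline int wait_on_page_locked_killable(struct page *page)
{
	if (PageLocked(page))
		return wait_on_page_bit_killable(page, PG_locked);
	return 0;
}
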
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 83364df74a33..93356cd12828 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -183,7 +183,7 @@ __xip_unmap (struct address_space * mapping,
183 return; 183 return;
184 184
185retry: 185retry:
186 spin_lock(&mapping->i_mmap_lock); 186 mutex_lock(&mapping->i_mmap_mutex);
187 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 187 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
188 mm = vma->vm_mm; 188 mm = vma->vm_mm;
189 address = vma->vm_start + 189 address = vma->vm_start +
@@ -201,7 +201,7 @@ retry:
201 page_cache_release(page); 201 page_cache_release(page);
202 } 202 }
203 } 203 }
204 spin_unlock(&mapping->i_mmap_lock); 204 mutex_unlock(&mapping->i_mmap_mutex);
205 205
206 if (locked) { 206 if (locked) {
207 mutex_unlock(&xip_sparse_mutex); 207 mutex_unlock(&xip_sparse_mutex);
diff --git a/mm/fremap.c b/mm/fremap.c
index ec520c7b28df..b8e0e2d468af 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -211,20 +211,20 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
211 } 211 }
212 goto out; 212 goto out;
213 } 213 }
214 spin_lock(&mapping->i_mmap_lock); 214 mutex_lock(&mapping->i_mmap_mutex);
215 flush_dcache_mmap_lock(mapping); 215 flush_dcache_mmap_lock(mapping);
216 vma->vm_flags |= VM_NONLINEAR; 216 vma->vm_flags |= VM_NONLINEAR;
217 vma_prio_tree_remove(vma, &mapping->i_mmap); 217 vma_prio_tree_remove(vma, &mapping->i_mmap);
218 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 218 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
219 flush_dcache_mmap_unlock(mapping); 219 flush_dcache_mmap_unlock(mapping);
220 spin_unlock(&mapping->i_mmap_lock); 220 mutex_unlock(&mapping->i_mmap_mutex);
221 } 221 }
222 222
223 if (vma->vm_flags & VM_LOCKED) { 223 if (vma->vm_flags & VM_LOCKED) {
224 /* 224 /*
225 * drop PG_Mlocked flag for over-mapped range 225 * drop PG_Mlocked flag for over-mapped range
226 */ 226 */
227 unsigned int saved_flags = vma->vm_flags; 227 vm_flags_t saved_flags = vma->vm_flags;
228 munlock_vma_pages_range(vma, start, start + size); 228 munlock_vma_pages_range(vma, start, start + size);
229 vma->vm_flags = saved_flags; 229 vma->vm_flags = saved_flags;
230 } 230 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 83326ad66d9b..615d9743a3cb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1139,7 +1139,7 @@ static int __split_huge_page_splitting(struct page *page,
1139 * We can't temporarily set the pmd to null in order 1139 * We can't temporarily set the pmd to null in order
1140 * to split it, the pmd must remain marked huge at all 1140 * to split it, the pmd must remain marked huge at all
1141 * times or the VM won't take the pmd_trans_huge paths 1141 * times or the VM won't take the pmd_trans_huge paths
1142 * and it won't wait on the anon_vma->root->lock to 1142 * and it won't wait on the anon_vma->root->mutex to
1143 * serialize against split_huge_page*. 1143 * serialize against split_huge_page*.
1144 */ 1144 */
1145 pmdp_splitting_flush_notify(vma, address, pmd); 1145 pmdp_splitting_flush_notify(vma, address, pmd);
@@ -1333,7 +1333,7 @@ static int __split_huge_page_map(struct page *page,
1333 return ret; 1333 return ret;
1334} 1334}
1335 1335
1336/* must be called with anon_vma->root->lock hold */ 1336/* must be called with anon_vma->root->mutex hold */
1337static void __split_huge_page(struct page *page, 1337static void __split_huge_page(struct page *page,
1338 struct anon_vma *anon_vma) 1338 struct anon_vma *anon_vma)
1339{ 1339{
@@ -1771,12 +1771,9 @@ static void collapse_huge_page(struct mm_struct *mm,
1771 1771
1772 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1772 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1773#ifndef CONFIG_NUMA 1773#ifndef CONFIG_NUMA
1774 up_read(&mm->mmap_sem);
1774 VM_BUG_ON(!*hpage); 1775 VM_BUG_ON(!*hpage);
1775 new_page = *hpage; 1776 new_page = *hpage;
1776 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1777 up_read(&mm->mmap_sem);
1778 return;
1779 }
1780#else 1777#else
1781 VM_BUG_ON(*hpage); 1778 VM_BUG_ON(*hpage);
1782 /* 1779 /*
@@ -1791,22 +1788,26 @@ static void collapse_huge_page(struct mm_struct *mm,
1791 */ 1788 */
1792 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, 1789 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
1793 node, __GFP_OTHER_NODE); 1790 node, __GFP_OTHER_NODE);
1791
1792 /*
1793 * After allocating the hugepage, release the mmap_sem read lock in
1794 * preparation for taking it in write mode.
1795 */
1796 up_read(&mm->mmap_sem);
1794 if (unlikely(!new_page)) { 1797 if (unlikely(!new_page)) {
1795 up_read(&mm->mmap_sem);
1796 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 1798 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1797 *hpage = ERR_PTR(-ENOMEM); 1799 *hpage = ERR_PTR(-ENOMEM);
1798 return; 1800 return;
1799 } 1801 }
1802#endif
1803
1800 count_vm_event(THP_COLLAPSE_ALLOC); 1804 count_vm_event(THP_COLLAPSE_ALLOC);
1801 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1805 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1802 up_read(&mm->mmap_sem); 1806#ifdef CONFIG_NUMA
1803 put_page(new_page); 1807 put_page(new_page);
1808#endif
1804 return; 1809 return;
1805 } 1810 }
1806#endif
1807
1808 /* after allocating the hugepage upgrade to mmap_sem write mode */
1809 up_read(&mm->mmap_sem);
1810 1811
1811 /* 1812 /*
1812 * Prevent all access to pagetables with the exception of 1813 * Prevent all access to pagetables with the exception of
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8ee3bd8ec5b5..f33bb319b73f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -475,7 +475,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
475 475
476 /* If reserves cannot be used, ensure enough pages are in the pool */ 476 /* If reserves cannot be used, ensure enough pages are in the pool */
477 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) 477 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
478 goto err;; 478 goto err;
479 479
480 for_each_zone_zonelist_nodemask(zone, z, zonelist, 480 for_each_zone_zonelist_nodemask(zone, z, zonelist,
481 MAX_NR_ZONES - 1, nodemask) { 481 MAX_NR_ZONES - 1, nodemask) {
@@ -2205,7 +2205,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2205 unsigned long sz = huge_page_size(h); 2205 unsigned long sz = huge_page_size(h);
2206 2206
2207 /* 2207 /*
2208 * A page gathering list, protected by per file i_mmap_lock. The 2208 * A page gathering list, protected by per file i_mmap_mutex. The
2209 * lock is used to avoid list corruption from multiple unmapping 2209 * lock is used to avoid list corruption from multiple unmapping
2210 * of the same page since we are using page->lru. 2210 * of the same page since we are using page->lru.
2211 */ 2211 */
@@ -2274,9 +2274,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2274void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2274void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2275 unsigned long end, struct page *ref_page) 2275 unsigned long end, struct page *ref_page)
2276{ 2276{
2277 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2277 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
2278 __unmap_hugepage_range(vma, start, end, ref_page); 2278 __unmap_hugepage_range(vma, start, end, ref_page);
2279 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 2279 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2280} 2280}
2281 2281
2282/* 2282/*
@@ -2308,7 +2308,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2308 * this mapping should be shared between all the VMAs, 2308 * this mapping should be shared between all the VMAs,
2309 * __unmap_hugepage_range() is called as the lock is already held 2309 * __unmap_hugepage_range() is called as the lock is already held
2310 */ 2310 */
2311 spin_lock(&mapping->i_mmap_lock); 2311 mutex_lock(&mapping->i_mmap_mutex);
2312 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 2312 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
2313 /* Do not unmap the current VMA */ 2313 /* Do not unmap the current VMA */
2314 if (iter_vma == vma) 2314 if (iter_vma == vma)
@@ -2326,7 +2326,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2326 address, address + huge_page_size(h), 2326 address, address + huge_page_size(h),
2327 page); 2327 page);
2328 } 2328 }
2329 spin_unlock(&mapping->i_mmap_lock); 2329 mutex_unlock(&mapping->i_mmap_mutex);
2330 2330
2331 return 1; 2331 return 1;
2332} 2332}
@@ -2810,7 +2810,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2810 BUG_ON(address >= end); 2810 BUG_ON(address >= end);
2811 flush_cache_range(vma, address, end); 2811 flush_cache_range(vma, address, end);
2812 2812
2813 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2813 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
2814 spin_lock(&mm->page_table_lock); 2814 spin_lock(&mm->page_table_lock);
2815 for (; address < end; address += huge_page_size(h)) { 2815 for (; address < end; address += huge_page_size(h)) {
2816 ptep = huge_pte_offset(mm, address); 2816 ptep = huge_pte_offset(mm, address);
@@ -2825,7 +2825,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2825 } 2825 }
2826 } 2826 }
2827 spin_unlock(&mm->page_table_lock); 2827 spin_unlock(&mm->page_table_lock);
2828 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 2828 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2829 2829
2830 flush_tlb_range(vma, start, end); 2830 flush_tlb_range(vma, start, end);
2831} 2831}
@@ -2833,7 +2833,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2833int hugetlb_reserve_pages(struct inode *inode, 2833int hugetlb_reserve_pages(struct inode *inode,
2834 long from, long to, 2834 long from, long to,
2835 struct vm_area_struct *vma, 2835 struct vm_area_struct *vma,
2836 int acctflag) 2836 vm_flags_t vm_flags)
2837{ 2837{
2838 long ret, chg; 2838 long ret, chg;
2839 struct hstate *h = hstate_inode(inode); 2839 struct hstate *h = hstate_inode(inode);
@@ -2843,7 +2843,7 @@ int hugetlb_reserve_pages(struct inode *inode,
2843 * attempt will be made for VM_NORESERVE to allocate a page 2843 * attempt will be made for VM_NORESERVE to allocate a page
2844 * and filesystem quota without using reserves 2844 * and filesystem quota without using reserves
2845 */ 2845 */
2846 if (acctflag & VM_NORESERVE) 2846 if (vm_flags & VM_NORESERVE)
2847 return 0; 2847 return 0;
2848 2848
2849 /* 2849 /*
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 1d29cdfe8ebb..4019979b2637 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -21,6 +21,5 @@ struct mm_struct init_mm = {
21 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), 21 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
22 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), 22 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
23 .mmlist = LIST_HEAD_INIT(init_mm.mmlist), 23 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
24 .cpu_vm_mask = CPU_MASK_ALL,
25 INIT_MM_CONTEXT(init_mm) 24 INIT_MM_CONTEXT(init_mm)
26}; 25};
diff --git a/mm/internal.h b/mm/internal.h
index 9d0ced8e505e..d071d380fb49 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -66,6 +66,10 @@ static inline unsigned long page_order(struct page *page)
66 return page_private(page); 66 return page_private(page);
67} 67}
68 68
69/* mm/util.c */
70void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
71 struct vm_area_struct *prev, struct rb_node *rb_parent);
72
69#ifdef CONFIG_MMU 73#ifdef CONFIG_MMU
70extern long mlock_vma_pages_range(struct vm_area_struct *vma, 74extern long mlock_vma_pages_range(struct vm_area_struct *vma,
71 unsigned long start, unsigned long end); 75 unsigned long start, unsigned long end);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c1d5867543e4..aacee45616fc 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1414,9 +1414,12 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1414 ++(*pos); 1414 ++(*pos);
1415 1415
1416 list_for_each_continue_rcu(n, &object_list) { 1416 list_for_each_continue_rcu(n, &object_list) {
1417 next_obj = list_entry(n, struct kmemleak_object, object_list); 1417 struct kmemleak_object *obj =
1418 if (get_object(next_obj)) 1418 list_entry(n, struct kmemleak_object, object_list);
1419 if (get_object(obj)) {
1420 next_obj = obj;
1419 break; 1421 break;
1422 }
1420 } 1423 }
1421 1424
1422 put_object(prev_obj); 1425 put_object(prev_obj);
diff --git a/mm/ksm.c b/mm/ksm.c
index 942dfc73a2ff..d708b3ef2260 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -35,6 +35,7 @@
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/oom.h>
38 39
39#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
40#include "internal.h" 41#include "internal.h"
@@ -1894,9 +1895,11 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1894 if (ksm_run != flags) { 1895 if (ksm_run != flags) {
1895 ksm_run = flags; 1896 ksm_run = flags;
1896 if (flags & KSM_RUN_UNMERGE) { 1897 if (flags & KSM_RUN_UNMERGE) {
1897 current->flags |= PF_OOM_ORIGIN; 1898 int oom_score_adj;
1899
1900 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1898 err = unmerge_and_remove_all_rmap_items(); 1901 err = unmerge_and_remove_all_rmap_items();
1899 current->flags &= ~PF_OOM_ORIGIN; 1902 test_set_oom_score_adj(oom_score_adj);
1900 if (err) { 1903 if (err) {
1901 ksm_run = KSM_RUN_STOP; 1904 ksm_run = KSM_RUN_STOP;
1902 count = err; 1905 count = err;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 010f9166fa6e..bd9052a5d3ad 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -94,6 +94,8 @@ enum mem_cgroup_events_index {
94 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 94 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
95 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 95 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
96 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ 96 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
97 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
98 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
97 MEM_CGROUP_EVENTS_NSTATS, 99 MEM_CGROUP_EVENTS_NSTATS,
98}; 100};
99/* 101/*
@@ -231,6 +233,11 @@ struct mem_cgroup {
231 * reclaimed from. 233 * reclaimed from.
232 */ 234 */
233 int last_scanned_child; 235 int last_scanned_child;
236 int last_scanned_node;
237#if MAX_NUMNODES > 1
238 nodemask_t scan_nodes;
239 unsigned long next_scan_node_update;
240#endif
234 /* 241 /*
235 * Should the accounting and control be hierarchical, per subtree? 242 * Should the accounting and control be hierarchical, per subtree?
236 */ 243 */
@@ -585,6 +592,16 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
585 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 592 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
586} 593}
587 594
595void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
596{
597 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
598}
599
600void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
601{
602 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
603}
604
588static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, 605static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
589 enum mem_cgroup_events_index idx) 606 enum mem_cgroup_events_index idx)
590{ 607{
@@ -624,18 +641,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
624 preempt_enable(); 641 preempt_enable();
625} 642}
626 643
644static unsigned long
645mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
646{
647 struct mem_cgroup_per_zone *mz;
648 u64 total = 0;
649 int zid;
650
651 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
652 mz = mem_cgroup_zoneinfo(mem, nid, zid);
653 total += MEM_CGROUP_ZSTAT(mz, idx);
654 }
655 return total;
656}
627static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 657static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
628 enum lru_list idx) 658 enum lru_list idx)
629{ 659{
630 int nid, zid; 660 int nid;
631 struct mem_cgroup_per_zone *mz;
632 u64 total = 0; 661 u64 total = 0;
633 662
634 for_each_online_node(nid) 663 for_each_online_node(nid)
635 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 664 total += mem_cgroup_get_zonestat_node(mem, nid, idx);
636 mz = mem_cgroup_zoneinfo(mem, nid, zid);
637 total += MEM_CGROUP_ZSTAT(mz, idx);
638 }
639 return total; 665 return total;
640} 666}
641 667
@@ -813,6 +839,33 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
813 return (mem == root_mem_cgroup); 839 return (mem == root_mem_cgroup);
814} 840}
815 841
842void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
843{
844 struct mem_cgroup *mem;
845
846 if (!mm)
847 return;
848
849 rcu_read_lock();
850 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
851 if (unlikely(!mem))
852 goto out;
853
854 switch (idx) {
855 case PGMAJFAULT:
856 mem_cgroup_pgmajfault(mem, 1);
857 break;
858 case PGFAULT:
859 mem_cgroup_pgfault(mem, 1);
860 break;
861 default:
862 BUG();
863 }
864out:
865 rcu_read_unlock();
866}
867EXPORT_SYMBOL(mem_cgroup_count_vm_event);
868
816/* 869/*
817 * Following LRU functions are allowed to be used without PCG_LOCK. 870 * Following LRU functions are allowed to be used without PCG_LOCK.
818 * Operations are called by routine of global LRU independently from memcg. 871 * Operations are called by routine of global LRU independently from memcg.
@@ -1064,9 +1117,9 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
1064 return (active > inactive); 1117 return (active > inactive);
1065} 1118}
1066 1119
1067unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 1120unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
1068 struct zone *zone, 1121 struct zone *zone,
1069 enum lru_list lru) 1122 enum lru_list lru)
1070{ 1123{
1071 int nid = zone_to_nid(zone); 1124 int nid = zone_to_nid(zone);
1072 int zid = zone_idx(zone); 1125 int zid = zone_idx(zone);
@@ -1075,6 +1128,93 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
1075 return MEM_CGROUP_ZSTAT(mz, lru); 1128 return MEM_CGROUP_ZSTAT(mz, lru);
1076} 1129}
1077 1130
1131#ifdef CONFIG_NUMA
1132static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
1133 int nid)
1134{
1135 unsigned long ret;
1136
1137 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) +
1138 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE);
1139
1140 return ret;
1141}
1142
1143static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
1144{
1145 u64 total = 0;
1146 int nid;
1147
1148 for_each_node_state(nid, N_HIGH_MEMORY)
1149 total += mem_cgroup_node_nr_file_lru_pages(memcg, nid);
1150
1151 return total;
1152}
1153
1154static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
1155 int nid)
1156{
1157 unsigned long ret;
1158
1159 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
1160 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
1161
1162 return ret;
1163}
1164
1165static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
1166{
1167 u64 total = 0;
1168 int nid;
1169
1170 for_each_node_state(nid, N_HIGH_MEMORY)
1171 total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid);
1172
1173 return total;
1174}
1175
1176static unsigned long
1177mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid)
1178{
1179 return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE);
1180}
1181
1182static unsigned long
1183mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg)
1184{
1185 u64 total = 0;
1186 int nid;
1187
1188 for_each_node_state(nid, N_HIGH_MEMORY)
1189 total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid);
1190
1191 return total;
1192}
1193
1194static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
1195 int nid)
1196{
1197 enum lru_list l;
1198 u64 total = 0;
1199
1200 for_each_lru(l)
1201 total += mem_cgroup_get_zonestat_node(memcg, nid, l);
1202
1203 return total;
1204}
1205
1206static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg)
1207{
1208 u64 total = 0;
1209 int nid;
1210
1211 for_each_node_state(nid, N_HIGH_MEMORY)
1212 total += mem_cgroup_node_nr_lru_pages(memcg, nid);
1213
1214 return total;
1215}
1216#endif /* CONFIG_NUMA */
1217
1078struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1218struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1079 struct zone *zone) 1219 struct zone *zone)
1080{ 1220{
@@ -1418,6 +1558,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1418 return ret; 1558 return ret;
1419} 1559}
1420 1560
1561#if MAX_NUMNODES > 1
1562
1563/*
1564 * Always updating the nodemask is not very good - even if we have an empty
1565 * list or the wrong list here, we can start from some node and traverse all
1566 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1567 *
1568 */
1569static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
1570{
1571 int nid;
1572
1573 if (time_after(mem->next_scan_node_update, jiffies))
1574 return;
1575
1576 mem->next_scan_node_update = jiffies + 10*HZ;
1577 /* make a nodemask where this memcg uses memory from */
1578 mem->scan_nodes = node_states[N_HIGH_MEMORY];
1579
1580 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1581
1582 if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
1583 mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
1584 continue;
1585
1586 if (total_swap_pages &&
1587 (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
1588 mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
1589 continue;
1590 node_clear(nid, mem->scan_nodes);
1591 }
1592}
1593
1594/*
1595 * Selecting a node where we start reclaim from. Because what we need is just
1596 * reducing the usage counter, starting from anywhere is OK. Considering
1597 * memory reclaim from current node, there are pros. and cons.
1598 *
1599 * Freeing memory from current node means freeing memory from a node which
1600 * we'll use or we've used. So, it may make LRU bad. And if several threads
1601 * hit limits, it will see a contention on a node. But freeing from remote
1602 * node means more costs for memory reclaim because of memory latency.
1603 *
1604 * Now, we use round-robin. Better algorithm is welcomed.
1605 */
1606int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1607{
1608 int node;
1609
1610 mem_cgroup_may_update_nodemask(mem);
1611 node = mem->last_scanned_node;
1612
1613 node = next_node(node, mem->scan_nodes);
1614 if (node == MAX_NUMNODES)
1615 node = first_node(mem->scan_nodes);
1616 /*
1617 * We call this when we hit limit, not when pages are added to LRU.
1618 * No LRU may hold pages because all pages are UNEVICTABLE or
1619 * memcg is too small and all pages are not on LRU. In that case,
1620 * we use the current node.
1621 */
1622 if (unlikely(node == MAX_NUMNODES))
1623 node = numa_node_id();
1624
1625 mem->last_scanned_node = node;
1626 return node;
1627}
1628
1629#else
1630int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1631{
1632 return 0;
1633}
1634#endif
1635
1421/* 1636/*
1422 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1637 * Scan the hierarchy if needed to reclaim memory. We remember the last child
1423 * we reclaimed from, so that we don't end up penalizing one child extensively 1638 * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1433,7 +1648,8 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1433static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1648static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1434 struct zone *zone, 1649 struct zone *zone,
1435 gfp_t gfp_mask, 1650 gfp_t gfp_mask,
1436 unsigned long reclaim_options) 1651 unsigned long reclaim_options,
1652 unsigned long *total_scanned)
1437{ 1653{
1438 struct mem_cgroup *victim; 1654 struct mem_cgroup *victim;
1439 int ret, total = 0; 1655 int ret, total = 0;
@@ -1442,6 +1658,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1442 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1658 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1443 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1659 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1444 unsigned long excess; 1660 unsigned long excess;
1661 unsigned long nr_scanned;
1445 1662
1446 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1663 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1447 1664
@@ -1484,10 +1701,12 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1484 continue; 1701 continue;
1485 } 1702 }
1486 /* we use swappiness of local cgroup */ 1703 /* we use swappiness of local cgroup */
1487 if (check_soft) 1704 if (check_soft) {
1488 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1705 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1489 noswap, get_swappiness(victim), zone); 1706 noswap, get_swappiness(victim), zone,
1490 else 1707 &nr_scanned);
1708 *total_scanned += nr_scanned;
1709 } else
1491 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1710 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1492 noswap, get_swappiness(victim)); 1711 noswap, get_swappiness(victim));
1493 css_put(&victim->css); 1712 css_put(&victim->css);
@@ -1503,7 +1722,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1503 if (!res_counter_soft_limit_excess(&root_mem->res)) 1722 if (!res_counter_soft_limit_excess(&root_mem->res))
1504 return total; 1723 return total;
1505 } else if (mem_cgroup_margin(root_mem)) 1724 } else if (mem_cgroup_margin(root_mem))
1506 return 1 + total; 1725 return total;
1507 } 1726 }
1508 return total; 1727 return total;
1509} 1728}
@@ -1928,7 +2147,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1928 return CHARGE_WOULDBLOCK; 2147 return CHARGE_WOULDBLOCK;
1929 2148
1930 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 2149 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1931 gfp_mask, flags); 2150 gfp_mask, flags, NULL);
1932 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2151 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
1933 return CHARGE_RETRY; 2152 return CHARGE_RETRY;
1934 /* 2153 /*
@@ -3211,7 +3430,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3211 break; 3430 break;
3212 3431
3213 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3432 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3214 MEM_CGROUP_RECLAIM_SHRINK); 3433 MEM_CGROUP_RECLAIM_SHRINK,
3434 NULL);
3215 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3435 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3216 /* Usage is reduced ? */ 3436 /* Usage is reduced ? */
3217 if (curusage >= oldusage) 3437 if (curusage >= oldusage)
@@ -3271,7 +3491,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3271 3491
3272 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3492 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3273 MEM_CGROUP_RECLAIM_NOSWAP | 3493 MEM_CGROUP_RECLAIM_NOSWAP |
3274 MEM_CGROUP_RECLAIM_SHRINK); 3494 MEM_CGROUP_RECLAIM_SHRINK,
3495 NULL);
3275 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3496 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3276 /* Usage is reduced ? */ 3497 /* Usage is reduced ? */
3277 if (curusage >= oldusage) 3498 if (curusage >= oldusage)
@@ -3285,7 +3506,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3285} 3506}
3286 3507
3287unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3508unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3288 gfp_t gfp_mask) 3509 gfp_t gfp_mask,
3510 unsigned long *total_scanned)
3289{ 3511{
3290 unsigned long nr_reclaimed = 0; 3512 unsigned long nr_reclaimed = 0;
3291 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3513 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -3293,6 +3515,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3293 int loop = 0; 3515 int loop = 0;
3294 struct mem_cgroup_tree_per_zone *mctz; 3516 struct mem_cgroup_tree_per_zone *mctz;
3295 unsigned long long excess; 3517 unsigned long long excess;
3518 unsigned long nr_scanned;
3296 3519
3297 if (order > 0) 3520 if (order > 0)
3298 return 0; 3521 return 0;
@@ -3311,10 +3534,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3311 if (!mz) 3534 if (!mz)
3312 break; 3535 break;
3313 3536
3537 nr_scanned = 0;
3314 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 3538 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
3315 gfp_mask, 3539 gfp_mask,
3316 MEM_CGROUP_RECLAIM_SOFT); 3540 MEM_CGROUP_RECLAIM_SOFT,
3541 &nr_scanned);
3317 nr_reclaimed += reclaimed; 3542 nr_reclaimed += reclaimed;
3543 *total_scanned += nr_scanned;
3318 spin_lock(&mctz->lock); 3544 spin_lock(&mctz->lock);
3319 3545
3320 /* 3546 /*
@@ -3337,10 +3563,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3337 */ 3563 */
3338 next_mz = 3564 next_mz =
3339 __mem_cgroup_largest_soft_limit_node(mctz); 3565 __mem_cgroup_largest_soft_limit_node(mctz);
3340 if (next_mz == mz) { 3566 if (next_mz == mz)
3341 css_put(&next_mz->mem->css); 3567 css_put(&next_mz->mem->css);
3342 next_mz = NULL; 3568 else /* next_mz == NULL or other memcg */
3343 } else /* next_mz == NULL or other memcg */
3344 break; 3569 break;
3345 } while (1); 3570 } while (1);
3346 } 3571 }
@@ -3772,6 +3997,8 @@ enum {
3772 MCS_PGPGIN, 3997 MCS_PGPGIN,
3773 MCS_PGPGOUT, 3998 MCS_PGPGOUT,
3774 MCS_SWAP, 3999 MCS_SWAP,
4000 MCS_PGFAULT,
4001 MCS_PGMAJFAULT,
3775 MCS_INACTIVE_ANON, 4002 MCS_INACTIVE_ANON,
3776 MCS_ACTIVE_ANON, 4003 MCS_ACTIVE_ANON,
3777 MCS_INACTIVE_FILE, 4004 MCS_INACTIVE_FILE,
@@ -3794,6 +4021,8 @@ struct {
3794 {"pgpgin", "total_pgpgin"}, 4021 {"pgpgin", "total_pgpgin"},
3795 {"pgpgout", "total_pgpgout"}, 4022 {"pgpgout", "total_pgpgout"},
3796 {"swap", "total_swap"}, 4023 {"swap", "total_swap"},
4024 {"pgfault", "total_pgfault"},
4025 {"pgmajfault", "total_pgmajfault"},
3797 {"inactive_anon", "total_inactive_anon"}, 4026 {"inactive_anon", "total_inactive_anon"},
3798 {"active_anon", "total_active_anon"}, 4027 {"active_anon", "total_active_anon"},
3799 {"inactive_file", "total_inactive_file"}, 4028 {"inactive_file", "total_inactive_file"},
@@ -3822,6 +4051,10 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3822 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 4051 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3823 s->stat[MCS_SWAP] += val * PAGE_SIZE; 4052 s->stat[MCS_SWAP] += val * PAGE_SIZE;
3824 } 4053 }
4054 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
4055 s->stat[MCS_PGFAULT] += val;
4056 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
4057 s->stat[MCS_PGMAJFAULT] += val;
3825 4058
3826 /* per zone stat */ 4059 /* per zone stat */
3827 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 4060 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -3845,6 +4078,51 @@ mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3845 mem_cgroup_get_local_stat(iter, s); 4078 mem_cgroup_get_local_stat(iter, s);
3846} 4079}
3847 4080
4081#ifdef CONFIG_NUMA
4082static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4083{
4084 int nid;
4085 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4086 unsigned long node_nr;
4087 struct cgroup *cont = m->private;
4088 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4089
4090 total_nr = mem_cgroup_nr_lru_pages(mem_cont);
4091 seq_printf(m, "total=%lu", total_nr);
4092 for_each_node_state(nid, N_HIGH_MEMORY) {
4093 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid);
4094 seq_printf(m, " N%d=%lu", nid, node_nr);
4095 }
4096 seq_putc(m, '\n');
4097
4098 file_nr = mem_cgroup_nr_file_lru_pages(mem_cont);
4099 seq_printf(m, "file=%lu", file_nr);
4100 for_each_node_state(nid, N_HIGH_MEMORY) {
4101 node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid);
4102 seq_printf(m, " N%d=%lu", nid, node_nr);
4103 }
4104 seq_putc(m, '\n');
4105
4106 anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont);
4107 seq_printf(m, "anon=%lu", anon_nr);
4108 for_each_node_state(nid, N_HIGH_MEMORY) {
4109 node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid);
4110 seq_printf(m, " N%d=%lu", nid, node_nr);
4111 }
4112 seq_putc(m, '\n');
4113
4114 unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont);
4115 seq_printf(m, "unevictable=%lu", unevictable_nr);
4116 for_each_node_state(nid, N_HIGH_MEMORY) {
4117 node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont,
4118 nid);
4119 seq_printf(m, " N%d=%lu", nid, node_nr);
4120 }
4121 seq_putc(m, '\n');
4122 return 0;
4123}
4124#endif /* CONFIG_NUMA */
4125
3848static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4126static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3849 struct cgroup_map_cb *cb) 4127 struct cgroup_map_cb *cb)
3850{ 4128{
@@ -3855,6 +4133,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3855 memset(&mystat, 0, sizeof(mystat)); 4133 memset(&mystat, 0, sizeof(mystat));
3856 mem_cgroup_get_local_stat(mem_cont, &mystat); 4134 mem_cgroup_get_local_stat(mem_cont, &mystat);
3857 4135
4136
3858 for (i = 0; i < NR_MCS_STAT; i++) { 4137 for (i = 0; i < NR_MCS_STAT; i++) {
3859 if (i == MCS_SWAP && !do_swap_account) 4138 if (i == MCS_SWAP && !do_swap_account)
3860 continue; 4139 continue;
@@ -4278,6 +4557,22 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4278 return 0; 4557 return 0;
4279} 4558}
4280 4559
4560#ifdef CONFIG_NUMA
4561static const struct file_operations mem_control_numa_stat_file_operations = {
4562 .read = seq_read,
4563 .llseek = seq_lseek,
4564 .release = single_release,
4565};
4566
4567static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4568{
4569 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
4570
4571 file->f_op = &mem_control_numa_stat_file_operations;
4572 return single_open(file, mem_control_numa_stat_show, cont);
4573}
4574#endif /* CONFIG_NUMA */
4575
4281static struct cftype mem_cgroup_files[] = { 4576static struct cftype mem_cgroup_files[] = {
4282 { 4577 {
4283 .name = "usage_in_bytes", 4578 .name = "usage_in_bytes",
@@ -4341,6 +4636,12 @@ static struct cftype mem_cgroup_files[] = {
4341 .unregister_event = mem_cgroup_oom_unregister_event, 4636 .unregister_event = mem_cgroup_oom_unregister_event,
4342 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4637 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4343 }, 4638 },
4639#ifdef CONFIG_NUMA
4640 {
4641 .name = "numa_stat",
4642 .open = mem_control_numa_stat_open,
4643 },
4644#endif
4344}; 4645};
4345 4646
4346#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4647#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4596,6 +4897,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4596 res_counter_init(&mem->memsw, NULL); 4897 res_counter_init(&mem->memsw, NULL);
4597 } 4898 }
4598 mem->last_scanned_child = 0; 4899 mem->last_scanned_child = 0;
4900 mem->last_scanned_node = MAX_NUMNODES;
4599 INIT_LIST_HEAD(&mem->oom_notify); 4901 INIT_LIST_HEAD(&mem->oom_notify);
4600 4902
4601 if (parent) 4903 if (parent)
@@ -4953,8 +5255,7 @@ static void mem_cgroup_clear_mc(void)
4953 5255
4954static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5256static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4955 struct cgroup *cgroup, 5257 struct cgroup *cgroup,
4956 struct task_struct *p, 5258 struct task_struct *p)
4957 bool threadgroup)
4958{ 5259{
4959 int ret = 0; 5260 int ret = 0;
4960 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 5261 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
@@ -4993,8 +5294,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4993 5294
4994static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5295static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4995 struct cgroup *cgroup, 5296 struct cgroup *cgroup,
4996 struct task_struct *p, 5297 struct task_struct *p)
4997 bool threadgroup)
4998{ 5298{
4999 mem_cgroup_clear_mc(); 5299 mem_cgroup_clear_mc();
5000} 5300}
@@ -5112,8 +5412,7 @@ retry:
5112static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5412static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5113 struct cgroup *cont, 5413 struct cgroup *cont,
5114 struct cgroup *old_cont, 5414 struct cgroup *old_cont,
5115 struct task_struct *p, 5415 struct task_struct *p)
5116 bool threadgroup)
5117{ 5416{
5118 struct mm_struct *mm; 5417 struct mm_struct *mm;
5119 5418
@@ -5131,22 +5430,19 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5131#else /* !CONFIG_MMU */ 5430#else /* !CONFIG_MMU */
5132static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5431static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5133 struct cgroup *cgroup, 5432 struct cgroup *cgroup,
5134 struct task_struct *p, 5433 struct task_struct *p)
5135 bool threadgroup)
5136{ 5434{
5137 return 0; 5435 return 0;
5138} 5436}
5139static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5437static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
5140 struct cgroup *cgroup, 5438 struct cgroup *cgroup,
5141 struct task_struct *p, 5439 struct task_struct *p)
5142 bool threadgroup)
5143{ 5440{
5144} 5441}
5145static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5442static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5146 struct cgroup *cont, 5443 struct cgroup *cont,
5147 struct cgroup *old_cont, 5444 struct cgroup *old_cont,
5148 struct task_struct *p, 5445 struct task_struct *p)
5149 bool threadgroup)
5150{ 5446{
5151} 5447}
5152#endif 5448#endif
@@ -5169,19 +5465,12 @@ struct cgroup_subsys mem_cgroup_subsys = {
5169static int __init enable_swap_account(char *s) 5465static int __init enable_swap_account(char *s)
5170{ 5466{
5171 /* consider enabled if no parameter or 1 is given */ 5467 /* consider enabled if no parameter or 1 is given */
5172 if (!(*s) || !strcmp(s, "=1")) 5468 if (!strcmp(s, "1"))
5173 really_do_swap_account = 1; 5469 really_do_swap_account = 1;
5174 else if (!strcmp(s, "=0")) 5470 else if (!strcmp(s, "0"))
5175 really_do_swap_account = 0; 5471 really_do_swap_account = 0;
5176 return 1; 5472 return 1;
5177} 5473}
5178__setup("swapaccount", enable_swap_account); 5474__setup("swapaccount=", enable_swap_account);
5179 5475
5180static int __init disable_swap_account(char *s)
5181{
5182 printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
5183 enable_swap_account("=0");
5184 return 1;
5185}
5186__setup("noswapaccount", disable_swap_account);
5187#endif 5476#endif
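The memcontrol.c hunks above also tighten the swapaccount boot-parameter handling: the option is now registered as "swapaccount=" and the deprecated "noswapaccount" alias is removed, so the handler only ever sees the bare value after the '='. A minimal userspace sketch of that value parsing, assuming accounting enabled as the default for the sketch; it mirrors enable_swap_account() from the hunk but is not kernel code:

#include <stdio.h>
#include <string.h>

static int really_do_swap_account = 1;	/* assumed default, for this sketch only */

/* Model of enable_swap_account(): "1" enables, "0" disables, anything else is ignored. */
static void parse_swapaccount(const char *val)
{
	if (!strcmp(val, "1"))
		really_do_swap_account = 1;
	else if (!strcmp(val, "0"))
		really_do_swap_account = 0;
}

int main(void)
{
	parse_swapaccount("0");
	printf("swapaccount=0 -> %d\n", really_do_swap_account);
	parse_swapaccount("1");
	printf("swapaccount=1 -> %d\n", really_do_swap_account);
	return 0;
}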
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2b9a5eef39e0..5c8f7e08928d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -239,7 +239,11 @@ void shake_page(struct page *p, int access)
239 if (access) { 239 if (access) {
240 int nr; 240 int nr;
241 do { 241 do {
242 nr = shrink_slab(1000, GFP_KERNEL, 1000); 242 struct shrink_control shrink = {
243 .gfp_mask = GFP_KERNEL,
244 };
245
246 nr = shrink_slab(&shrink, 1000, 1000);
243 if (page_count(p) == 1) 247 if (page_count(p) == 1)
244 break; 248 break;
245 } while (nr > 10); 249 } while (nr > 10);
@@ -429,7 +433,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
429 */ 433 */
430 434
431 read_lock(&tasklist_lock); 435 read_lock(&tasklist_lock);
432 spin_lock(&mapping->i_mmap_lock); 436 mutex_lock(&mapping->i_mmap_mutex);
433 for_each_process(tsk) { 437 for_each_process(tsk) {
434 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 438 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
435 439
@@ -449,7 +453,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
449 add_to_kill(tsk, page, vma, to_kill, tkc); 453 add_to_kill(tsk, page, vma, to_kill, tkc);
450 } 454 }
451 } 455 }
452 spin_unlock(&mapping->i_mmap_lock); 456 mutex_unlock(&mapping->i_mmap_mutex);
453 read_unlock(&tasklist_lock); 457 read_unlock(&tasklist_lock);
454} 458}
455 459
@@ -1440,16 +1444,12 @@ int soft_offline_page(struct page *page, int flags)
1440 */ 1444 */
1441 ret = invalidate_inode_page(page); 1445 ret = invalidate_inode_page(page);
1442 unlock_page(page); 1446 unlock_page(page);
1443
1444 /* 1447 /*
1445 * Drop count because page migration doesn't like raised
1446 * counts. The page could get re-allocated, but if it becomes
1447 * LRU the isolation will just fail.
1448 * RED-PEN would be better to keep it isolated here, but we 1448 * RED-PEN would be better to keep it isolated here, but we
1449 * would need to fix isolation locking first. 1449 * would need to fix isolation locking first.
1450 */ 1450 */
1451 put_page(page);
1452 if (ret == 1) { 1451 if (ret == 1) {
1452 put_page(page);
1453 ret = 0; 1453 ret = 0;
1454 pr_info("soft_offline: %#lx: invalidated\n", pfn); 1454 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1455 goto done; 1455 goto done;
@@ -1461,6 +1461,11 @@ int soft_offline_page(struct page *page, int flags)
1461 * handles a large number of cases for us. 1461 * handles a large number of cases for us.
1462 */ 1462 */
1463 ret = isolate_lru_page(page); 1463 ret = isolate_lru_page(page);
1464 /*
 1465 * Drop the page reference that came from get_any_page();
 1466 * a successful isolate_lru_page() already took another one.
1467 */
1468 put_page(page);
1464 if (!ret) { 1469 if (!ret) {
1465 LIST_HEAD(pagelist); 1470 LIST_HEAD(pagelist);
1466 1471
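The shake_page() hunk above follows the new shrink_slab() calling convention used throughout this series: the gfp mask travels in a struct shrink_control instead of a bare argument. Assembled from the interleaved old/new lines, the resulting loop reads as below (kernel context only; p is the page being shaken, and 1000/1000 are simply the scan counts shake_page() uses):

	int nr;

	do {
		struct shrink_control shrink = {
			.gfp_mask = GFP_KERNEL,
		};

		/* keep nudging the slab shrinkers while they report progress */
		nr = shrink_slab(&shrink, 1000, 1000);
		if (page_count(p) == 1)
			break;
	} while (nr > 10);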
diff --git a/mm/memory.c b/mm/memory.c
index 61e66f026563..6953d3926e01 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -182,7 +182,7 @@ void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
182{ 182{
183 __sync_task_rss_stat(task, mm); 183 __sync_task_rss_stat(task, mm);
184} 184}
185#else 185#else /* SPLIT_RSS_COUNTING */
186 186
187#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) 187#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
188#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) 188#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
@@ -191,8 +191,205 @@ static void check_sync_rss_stat(struct task_struct *task)
191{ 191{
192} 192}
193 193
194#endif /* SPLIT_RSS_COUNTING */
195
196#ifdef HAVE_GENERIC_MMU_GATHER
197
198static int tlb_next_batch(struct mmu_gather *tlb)
199{
200 struct mmu_gather_batch *batch;
201
202 batch = tlb->active;
203 if (batch->next) {
204 tlb->active = batch->next;
205 return 1;
206 }
207
208 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
209 if (!batch)
210 return 0;
211
212 batch->next = NULL;
213 batch->nr = 0;
214 batch->max = MAX_GATHER_BATCH;
215
216 tlb->active->next = batch;
217 tlb->active = batch;
218
219 return 1;
220}
221
222/* tlb_gather_mmu
223 * Called to initialize an (on-stack) mmu_gather structure for page-table
224 * tear-down from @mm. The @fullmm argument is used when @mm is without
225 * users and we're going to destroy the full address space (exit/execve).
226 */
227void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
228{
229 tlb->mm = mm;
230
231 tlb->fullmm = fullmm;
232 tlb->need_flush = 0;
233 tlb->fast_mode = (num_possible_cpus() == 1);
234 tlb->local.next = NULL;
235 tlb->local.nr = 0;
236 tlb->local.max = ARRAY_SIZE(tlb->__pages);
237 tlb->active = &tlb->local;
238
239#ifdef CONFIG_HAVE_RCU_TABLE_FREE
240 tlb->batch = NULL;
241#endif
242}
243
244void tlb_flush_mmu(struct mmu_gather *tlb)
245{
246 struct mmu_gather_batch *batch;
247
248 if (!tlb->need_flush)
249 return;
250 tlb->need_flush = 0;
251 tlb_flush(tlb);
252#ifdef CONFIG_HAVE_RCU_TABLE_FREE
253 tlb_table_flush(tlb);
194#endif 254#endif
195 255
256 if (tlb_fast_mode(tlb))
257 return;
258
259 for (batch = &tlb->local; batch; batch = batch->next) {
260 free_pages_and_swap_cache(batch->pages, batch->nr);
261 batch->nr = 0;
262 }
263 tlb->active = &tlb->local;
264}
265
266/* tlb_finish_mmu
267 * Called at the end of the shootdown operation to free up any resources
268 * that were required.
269 */
270void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
271{
272 struct mmu_gather_batch *batch, *next;
273
274 tlb_flush_mmu(tlb);
275
276 /* keep the page table cache within bounds */
277 check_pgt_cache();
278
279 for (batch = tlb->local.next; batch; batch = next) {
280 next = batch->next;
281 free_pages((unsigned long)batch, 0);
282 }
283 tlb->local.next = NULL;
284}
285
286/* __tlb_remove_page
287 * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
288 * handling the additional races in SMP caused by other CPUs caching valid
289 * mappings in their TLBs. Returns the number of free page slots left.
290 * When out of page slots we must call tlb_flush_mmu().
291 */
292int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
293{
294 struct mmu_gather_batch *batch;
295
296 tlb->need_flush = 1;
297
298 if (tlb_fast_mode(tlb)) {
299 free_page_and_swap_cache(page);
300 return 1; /* avoid calling tlb_flush_mmu() */
301 }
302
303 batch = tlb->active;
304 batch->pages[batch->nr++] = page;
305 if (batch->nr == batch->max) {
306 if (!tlb_next_batch(tlb))
307 return 0;
308 }
309 VM_BUG_ON(batch->nr > batch->max);
310
311 return batch->max - batch->nr;
312}
313
314#endif /* HAVE_GENERIC_MMU_GATHER */
315
316#ifdef CONFIG_HAVE_RCU_TABLE_FREE
317
318/*
319 * See the comment near struct mmu_table_batch.
320 */
321
322static void tlb_remove_table_smp_sync(void *arg)
323{
324 /* Simply deliver the interrupt */
325}
326
327static void tlb_remove_table_one(void *table)
328{
329 /*
330 * This isn't an RCU grace period and hence the page-tables cannot be
331 * assumed to be actually RCU-freed.
332 *
333 * It is however sufficient for software page-table walkers that rely on
334 * IRQ disabling. See the comment near struct mmu_table_batch.
335 */
336 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
337 __tlb_remove_table(table);
338}
339
340static void tlb_remove_table_rcu(struct rcu_head *head)
341{
342 struct mmu_table_batch *batch;
343 int i;
344
345 batch = container_of(head, struct mmu_table_batch, rcu);
346
347 for (i = 0; i < batch->nr; i++)
348 __tlb_remove_table(batch->tables[i]);
349
350 free_page((unsigned long)batch);
351}
352
353void tlb_table_flush(struct mmu_gather *tlb)
354{
355 struct mmu_table_batch **batch = &tlb->batch;
356
357 if (*batch) {
358 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
359 *batch = NULL;
360 }
361}
362
363void tlb_remove_table(struct mmu_gather *tlb, void *table)
364{
365 struct mmu_table_batch **batch = &tlb->batch;
366
367 tlb->need_flush = 1;
368
369 /*
370 * When there's less then two users of this mm there cannot be a
371 * concurrent page-table walk.
372 */
373 if (atomic_read(&tlb->mm->mm_users) < 2) {
374 __tlb_remove_table(table);
375 return;
376 }
377
378 if (*batch == NULL) {
379 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
380 if (*batch == NULL) {
381 tlb_remove_table_one(table);
382 return;
383 }
384 (*batch)->nr = 0;
385 }
386 (*batch)->tables[(*batch)->nr++] = table;
387 if ((*batch)->nr == MAX_TABLE_BATCH)
388 tlb_table_flush(tlb);
389}
390
391#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
392
196/* 393/*
197 * If a p?d_bad entry is found while walking page tables, report 394 * If a p?d_bad entry is found while walking page tables, report
198 * the error, before resetting entry to p?d_none. Usually (but 395 * the error, before resetting entry to p?d_none. Usually (but
@@ -533,7 +730,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
533 add_taint(TAINT_BAD_PAGE); 730 add_taint(TAINT_BAD_PAGE);
534} 731}
535 732
536static inline int is_cow_mapping(unsigned int flags) 733static inline int is_cow_mapping(vm_flags_t flags)
537{ 734{
538 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 735 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
539} 736}
@@ -909,26 +1106,24 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
909static unsigned long zap_pte_range(struct mmu_gather *tlb, 1106static unsigned long zap_pte_range(struct mmu_gather *tlb,
910 struct vm_area_struct *vma, pmd_t *pmd, 1107 struct vm_area_struct *vma, pmd_t *pmd,
911 unsigned long addr, unsigned long end, 1108 unsigned long addr, unsigned long end,
912 long *zap_work, struct zap_details *details) 1109 struct zap_details *details)
913{ 1110{
914 struct mm_struct *mm = tlb->mm; 1111 struct mm_struct *mm = tlb->mm;
915 pte_t *pte; 1112 int force_flush = 0;
916 spinlock_t *ptl;
917 int rss[NR_MM_COUNTERS]; 1113 int rss[NR_MM_COUNTERS];
1114 spinlock_t *ptl;
1115 pte_t *pte;
918 1116
1117again:
919 init_rss_vec(rss); 1118 init_rss_vec(rss);
920
921 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 1119 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
922 arch_enter_lazy_mmu_mode(); 1120 arch_enter_lazy_mmu_mode();
923 do { 1121 do {
924 pte_t ptent = *pte; 1122 pte_t ptent = *pte;
925 if (pte_none(ptent)) { 1123 if (pte_none(ptent)) {
926 (*zap_work)--;
927 continue; 1124 continue;
928 } 1125 }
929 1126
930 (*zap_work) -= PAGE_SIZE;
931
932 if (pte_present(ptent)) { 1127 if (pte_present(ptent)) {
933 struct page *page; 1128 struct page *page;
934 1129
@@ -974,7 +1169,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
974 page_remove_rmap(page); 1169 page_remove_rmap(page);
975 if (unlikely(page_mapcount(page) < 0)) 1170 if (unlikely(page_mapcount(page) < 0))
976 print_bad_pte(vma, addr, ptent, page); 1171 print_bad_pte(vma, addr, ptent, page);
977 tlb_remove_page(tlb, page); 1172 force_flush = !__tlb_remove_page(tlb, page);
1173 if (force_flush)
1174 break;
978 continue; 1175 continue;
979 } 1176 }
980 /* 1177 /*
@@ -995,19 +1192,31 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
995 print_bad_pte(vma, addr, ptent, NULL); 1192 print_bad_pte(vma, addr, ptent, NULL);
996 } 1193 }
997 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 1194 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
998 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 1195 } while (pte++, addr += PAGE_SIZE, addr != end);
999 1196
1000 add_mm_rss_vec(mm, rss); 1197 add_mm_rss_vec(mm, rss);
1001 arch_leave_lazy_mmu_mode(); 1198 arch_leave_lazy_mmu_mode();
1002 pte_unmap_unlock(pte - 1, ptl); 1199 pte_unmap_unlock(pte - 1, ptl);
1003 1200
1201 /*
1202 * mmu_gather ran out of room to batch pages, we break out of
 1203 * the PTE lock to avoid doing the potentially expensive TLB invalidate
1204 * and page-free while holding it.
1205 */
1206 if (force_flush) {
1207 force_flush = 0;
1208 tlb_flush_mmu(tlb);
1209 if (addr != end)
1210 goto again;
1211 }
1212
1004 return addr; 1213 return addr;
1005} 1214}
1006 1215
1007static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, 1216static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1008 struct vm_area_struct *vma, pud_t *pud, 1217 struct vm_area_struct *vma, pud_t *pud,
1009 unsigned long addr, unsigned long end, 1218 unsigned long addr, unsigned long end,
1010 long *zap_work, struct zap_details *details) 1219 struct zap_details *details)
1011{ 1220{
1012 pmd_t *pmd; 1221 pmd_t *pmd;
1013 unsigned long next; 1222 unsigned long next;
@@ -1019,19 +1228,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1019 if (next-addr != HPAGE_PMD_SIZE) { 1228 if (next-addr != HPAGE_PMD_SIZE) {
1020 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1229 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1021 split_huge_page_pmd(vma->vm_mm, pmd); 1230 split_huge_page_pmd(vma->vm_mm, pmd);
1022 } else if (zap_huge_pmd(tlb, vma, pmd)) { 1231 } else if (zap_huge_pmd(tlb, vma, pmd))
1023 (*zap_work)--;
1024 continue; 1232 continue;
1025 }
1026 /* fall through */ 1233 /* fall through */
1027 } 1234 }
1028 if (pmd_none_or_clear_bad(pmd)) { 1235 if (pmd_none_or_clear_bad(pmd))
1029 (*zap_work)--;
1030 continue; 1236 continue;
1031 } 1237 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1032 next = zap_pte_range(tlb, vma, pmd, addr, next, 1238 cond_resched();
1033 zap_work, details); 1239 } while (pmd++, addr = next, addr != end);
1034 } while (pmd++, addr = next, (addr != end && *zap_work > 0));
1035 1240
1036 return addr; 1241 return addr;
1037} 1242}
@@ -1039,7 +1244,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1039static inline unsigned long zap_pud_range(struct mmu_gather *tlb, 1244static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1040 struct vm_area_struct *vma, pgd_t *pgd, 1245 struct vm_area_struct *vma, pgd_t *pgd,
1041 unsigned long addr, unsigned long end, 1246 unsigned long addr, unsigned long end,
1042 long *zap_work, struct zap_details *details) 1247 struct zap_details *details)
1043{ 1248{
1044 pud_t *pud; 1249 pud_t *pud;
1045 unsigned long next; 1250 unsigned long next;
@@ -1047,13 +1252,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1047 pud = pud_offset(pgd, addr); 1252 pud = pud_offset(pgd, addr);
1048 do { 1253 do {
1049 next = pud_addr_end(addr, end); 1254 next = pud_addr_end(addr, end);
1050 if (pud_none_or_clear_bad(pud)) { 1255 if (pud_none_or_clear_bad(pud))
1051 (*zap_work)--;
1052 continue; 1256 continue;
1053 } 1257 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1054 next = zap_pmd_range(tlb, vma, pud, addr, next, 1258 } while (pud++, addr = next, addr != end);
1055 zap_work, details);
1056 } while (pud++, addr = next, (addr != end && *zap_work > 0));
1057 1259
1058 return addr; 1260 return addr;
1059} 1261}
@@ -1061,7 +1263,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1061static unsigned long unmap_page_range(struct mmu_gather *tlb, 1263static unsigned long unmap_page_range(struct mmu_gather *tlb,
1062 struct vm_area_struct *vma, 1264 struct vm_area_struct *vma,
1063 unsigned long addr, unsigned long end, 1265 unsigned long addr, unsigned long end,
1064 long *zap_work, struct zap_details *details) 1266 struct zap_details *details)
1065{ 1267{
1066 pgd_t *pgd; 1268 pgd_t *pgd;
1067 unsigned long next; 1269 unsigned long next;
@@ -1075,13 +1277,10 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1075 pgd = pgd_offset(vma->vm_mm, addr); 1277 pgd = pgd_offset(vma->vm_mm, addr);
1076 do { 1278 do {
1077 next = pgd_addr_end(addr, end); 1279 next = pgd_addr_end(addr, end);
1078 if (pgd_none_or_clear_bad(pgd)) { 1280 if (pgd_none_or_clear_bad(pgd))
1079 (*zap_work)--;
1080 continue; 1281 continue;
1081 } 1282 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1082 next = zap_pud_range(tlb, vma, pgd, addr, next, 1283 } while (pgd++, addr = next, addr != end);
1083 zap_work, details);
1084 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
1085 tlb_end_vma(tlb, vma); 1284 tlb_end_vma(tlb, vma);
1086 mem_cgroup_uncharge_end(); 1285 mem_cgroup_uncharge_end();
1087 1286
@@ -1121,17 +1320,12 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1121 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 1320 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1122 * drops the lock and schedules. 1321 * drops the lock and schedules.
1123 */ 1322 */
1124unsigned long unmap_vmas(struct mmu_gather **tlbp, 1323unsigned long unmap_vmas(struct mmu_gather *tlb,
1125 struct vm_area_struct *vma, unsigned long start_addr, 1324 struct vm_area_struct *vma, unsigned long start_addr,
1126 unsigned long end_addr, unsigned long *nr_accounted, 1325 unsigned long end_addr, unsigned long *nr_accounted,
1127 struct zap_details *details) 1326 struct zap_details *details)
1128{ 1327{
1129 long zap_work = ZAP_BLOCK_SIZE;
1130 unsigned long tlb_start = 0; /* For tlb_finish_mmu */
1131 int tlb_start_valid = 0;
1132 unsigned long start = start_addr; 1328 unsigned long start = start_addr;
1133 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
1134 int fullmm = (*tlbp)->fullmm;
1135 struct mm_struct *mm = vma->vm_mm; 1329 struct mm_struct *mm = vma->vm_mm;
1136 1330
1137 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1331 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
@@ -1152,11 +1346,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
1152 untrack_pfn_vma(vma, 0, 0); 1346 untrack_pfn_vma(vma, 0, 0);
1153 1347
1154 while (start != end) { 1348 while (start != end) {
1155 if (!tlb_start_valid) {
1156 tlb_start = start;
1157 tlb_start_valid = 1;
1158 }
1159
1160 if (unlikely(is_vm_hugetlb_page(vma))) { 1349 if (unlikely(is_vm_hugetlb_page(vma))) {
1161 /* 1350 /*
1162 * It is undesirable to test vma->vm_file as it 1351 * It is undesirable to test vma->vm_file as it
@@ -1169,39 +1358,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
1169 * Since no pte has actually been setup, it is 1358 * Since no pte has actually been setup, it is
1170 * safe to do nothing in this case. 1359 * safe to do nothing in this case.
1171 */ 1360 */
1172 if (vma->vm_file) { 1361 if (vma->vm_file)
1173 unmap_hugepage_range(vma, start, end, NULL); 1362 unmap_hugepage_range(vma, start, end, NULL);
1174 zap_work -= (end - start) /
1175 pages_per_huge_page(hstate_vma(vma));
1176 }
1177 1363
1178 start = end; 1364 start = end;
1179 } else 1365 } else
1180 start = unmap_page_range(*tlbp, vma, 1366 start = unmap_page_range(tlb, vma, start, end, details);
1181 start, end, &zap_work, details);
1182
1183 if (zap_work > 0) {
1184 BUG_ON(start != end);
1185 break;
1186 }
1187
1188 tlb_finish_mmu(*tlbp, tlb_start, start);
1189
1190 if (need_resched() ||
1191 (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
1192 if (i_mmap_lock) {
1193 *tlbp = NULL;
1194 goto out;
1195 }
1196 cond_resched();
1197 }
1198
1199 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
1200 tlb_start_valid = 0;
1201 zap_work = ZAP_BLOCK_SIZE;
1202 } 1367 }
1203 } 1368 }
1204out: 1369
1205 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1370 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1206 return start; /* which is now the end (or restart) address */ 1371 return start; /* which is now the end (or restart) address */
1207} 1372}
@@ -1217,16 +1382,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1217 unsigned long size, struct zap_details *details) 1382 unsigned long size, struct zap_details *details)
1218{ 1383{
1219 struct mm_struct *mm = vma->vm_mm; 1384 struct mm_struct *mm = vma->vm_mm;
1220 struct mmu_gather *tlb; 1385 struct mmu_gather tlb;
1221 unsigned long end = address + size; 1386 unsigned long end = address + size;
1222 unsigned long nr_accounted = 0; 1387 unsigned long nr_accounted = 0;
1223 1388
1224 lru_add_drain(); 1389 lru_add_drain();
1225 tlb = tlb_gather_mmu(mm, 0); 1390 tlb_gather_mmu(&tlb, mm, 0);
1226 update_hiwater_rss(mm); 1391 update_hiwater_rss(mm);
1227 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); 1392 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1228 if (tlb) 1393 tlb_finish_mmu(&tlb, address, end);
1229 tlb_finish_mmu(tlb, address, end);
1230 return end; 1394 return end;
1231} 1395}
1232 1396
@@ -2535,96 +2699,11 @@ unwritable_page:
2535 return ret; 2699 return ret;
2536} 2700}
2537 2701
2538/* 2702static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2539 * Helper functions for unmap_mapping_range().
2540 *
2541 * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
2542 *
2543 * We have to restart searching the prio_tree whenever we drop the lock,
2544 * since the iterator is only valid while the lock is held, and anyway
2545 * a later vma might be split and reinserted earlier while lock dropped.
2546 *
2547 * The list of nonlinear vmas could be handled more efficiently, using
2548 * a placeholder, but handle it in the same way until a need is shown.
2549 * It is important to search the prio_tree before nonlinear list: a vma
2550 * may become nonlinear and be shifted from prio_tree to nonlinear list
2551 * while the lock is dropped; but never shifted from list to prio_tree.
2552 *
2553 * In order to make forward progress despite restarting the search,
2554 * vm_truncate_count is used to mark a vma as now dealt with, so we can
2555 * quickly skip it next time around. Since the prio_tree search only
2556 * shows us those vmas affected by unmapping the range in question, we
2557 * can't efficiently keep all vmas in step with mapping->truncate_count:
2558 * so instead reset them all whenever it wraps back to 0 (then go to 1).
2559 * mapping->truncate_count and vma->vm_truncate_count are protected by
2560 * i_mmap_lock.
2561 *
2562 * In order to make forward progress despite repeatedly restarting some
2563 * large vma, note the restart_addr from unmap_vmas when it breaks out:
2564 * and restart from that address when we reach that vma again. It might
2565 * have been split or merged, shrunk or extended, but never shifted: so
2566 * restart_addr remains valid so long as it remains in the vma's range.
2567 * unmap_mapping_range forces truncate_count to leap over page-aligned
2568 * values so we can save vma's restart_addr in its truncate_count field.
2569 */
2570#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
2571
2572static void reset_vma_truncate_counts(struct address_space *mapping)
2573{
2574 struct vm_area_struct *vma;
2575 struct prio_tree_iter iter;
2576
2577 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
2578 vma->vm_truncate_count = 0;
2579 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
2580 vma->vm_truncate_count = 0;
2581}
2582
2583static int unmap_mapping_range_vma(struct vm_area_struct *vma,
2584 unsigned long start_addr, unsigned long end_addr, 2703 unsigned long start_addr, unsigned long end_addr,
2585 struct zap_details *details) 2704 struct zap_details *details)
2586{ 2705{
2587 unsigned long restart_addr; 2706 zap_page_range(vma, start_addr, end_addr - start_addr, details);
2588 int need_break;
2589
2590 /*
2591 * files that support invalidating or truncating portions of the
2592 * file from under mmaped areas must have their ->fault function
2593 * return a locked page (and set VM_FAULT_LOCKED in the return).
2594 * This provides synchronisation against concurrent unmapping here.
2595 */
2596
2597again:
2598 restart_addr = vma->vm_truncate_count;
2599 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
2600 start_addr = restart_addr;
2601 if (start_addr >= end_addr) {
2602 /* Top of vma has been split off since last time */
2603 vma->vm_truncate_count = details->truncate_count;
2604 return 0;
2605 }
2606 }
2607
2608 restart_addr = zap_page_range(vma, start_addr,
2609 end_addr - start_addr, details);
2610 need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
2611
2612 if (restart_addr >= end_addr) {
2613 /* We have now completed this vma: mark it so */
2614 vma->vm_truncate_count = details->truncate_count;
2615 if (!need_break)
2616 return 0;
2617 } else {
2618 /* Note restart_addr in vma's truncate_count field */
2619 vma->vm_truncate_count = restart_addr;
2620 if (!need_break)
2621 goto again;
2622 }
2623
2624 spin_unlock(details->i_mmap_lock);
2625 cond_resched();
2626 spin_lock(details->i_mmap_lock);
2627 return -EINTR;
2628} 2707}
2629 2708
2630static inline void unmap_mapping_range_tree(struct prio_tree_root *root, 2709static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
@@ -2634,12 +2713,8 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2634 struct prio_tree_iter iter; 2713 struct prio_tree_iter iter;
2635 pgoff_t vba, vea, zba, zea; 2714 pgoff_t vba, vea, zba, zea;
2636 2715
2637restart:
2638 vma_prio_tree_foreach(vma, &iter, root, 2716 vma_prio_tree_foreach(vma, &iter, root,
2639 details->first_index, details->last_index) { 2717 details->first_index, details->last_index) {
2640 /* Skip quickly over those we have already dealt with */
2641 if (vma->vm_truncate_count == details->truncate_count)
2642 continue;
2643 2718
2644 vba = vma->vm_pgoff; 2719 vba = vma->vm_pgoff;
2645 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; 2720 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
@@ -2651,11 +2726,10 @@ restart:
2651 if (zea > vea) 2726 if (zea > vea)
2652 zea = vea; 2727 zea = vea;
2653 2728
2654 if (unmap_mapping_range_vma(vma, 2729 unmap_mapping_range_vma(vma,
2655 ((zba - vba) << PAGE_SHIFT) + vma->vm_start, 2730 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2656 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, 2731 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2657 details) < 0) 2732 details);
2658 goto restart;
2659 } 2733 }
2660} 2734}
2661 2735
@@ -2670,15 +2744,9 @@ static inline void unmap_mapping_range_list(struct list_head *head,
2670 * across *all* the pages in each nonlinear VMA, not just the pages 2744 * across *all* the pages in each nonlinear VMA, not just the pages
2671 * whose virtual address lies outside the file truncation point. 2745 * whose virtual address lies outside the file truncation point.
2672 */ 2746 */
2673restart:
2674 list_for_each_entry(vma, head, shared.vm_set.list) { 2747 list_for_each_entry(vma, head, shared.vm_set.list) {
2675 /* Skip quickly over those we have already dealt with */
2676 if (vma->vm_truncate_count == details->truncate_count)
2677 continue;
2678 details->nonlinear_vma = vma; 2748 details->nonlinear_vma = vma;
2679 if (unmap_mapping_range_vma(vma, vma->vm_start, 2749 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2680 vma->vm_end, details) < 0)
2681 goto restart;
2682 } 2750 }
2683} 2751}
2684 2752
@@ -2717,26 +2785,14 @@ void unmap_mapping_range(struct address_space *mapping,
2717 details.last_index = hba + hlen - 1; 2785 details.last_index = hba + hlen - 1;
2718 if (details.last_index < details.first_index) 2786 if (details.last_index < details.first_index)
2719 details.last_index = ULONG_MAX; 2787 details.last_index = ULONG_MAX;
2720 details.i_mmap_lock = &mapping->i_mmap_lock;
2721 2788
2722 mutex_lock(&mapping->unmap_mutex);
2723 spin_lock(&mapping->i_mmap_lock);
2724
2725 /* Protect against endless unmapping loops */
2726 mapping->truncate_count++;
2727 if (unlikely(is_restart_addr(mapping->truncate_count))) {
2728 if (mapping->truncate_count == 0)
2729 reset_vma_truncate_counts(mapping);
2730 mapping->truncate_count++;
2731 }
2732 details.truncate_count = mapping->truncate_count;
2733 2789
2790 mutex_lock(&mapping->i_mmap_mutex);
2734 if (unlikely(!prio_tree_empty(&mapping->i_mmap))) 2791 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2735 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2792 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2736 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2793 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2737 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2794 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2738 spin_unlock(&mapping->i_mmap_lock); 2795 mutex_unlock(&mapping->i_mmap_mutex);
2739 mutex_unlock(&mapping->unmap_mutex);
2740} 2796}
2741EXPORT_SYMBOL(unmap_mapping_range); 2797EXPORT_SYMBOL(unmap_mapping_range);
2742 2798
@@ -2818,6 +2874,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2818 /* Had to read the page from swap area: Major fault */ 2874 /* Had to read the page from swap area: Major fault */
2819 ret = VM_FAULT_MAJOR; 2875 ret = VM_FAULT_MAJOR;
2820 count_vm_event(PGMAJFAULT); 2876 count_vm_event(PGMAJFAULT);
2877 mem_cgroup_count_vm_event(mm, PGMAJFAULT);
2821 } else if (PageHWPoison(page)) { 2878 } else if (PageHWPoison(page)) {
2822 /* 2879 /*
2823 * hwpoisoned dirty swapcache pages are kept for killing 2880 * hwpoisoned dirty swapcache pages are kept for killing
@@ -2966,7 +3023,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
2966 if (prev && prev->vm_end == address) 3023 if (prev && prev->vm_end == address)
2967 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; 3024 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
2968 3025
2969 expand_stack(vma, address - PAGE_SIZE); 3026 expand_downwards(vma, address - PAGE_SIZE);
2970 } 3027 }
2971 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { 3028 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
2972 struct vm_area_struct *next = vma->vm_next; 3029 struct vm_area_struct *next = vma->vm_next;
@@ -3357,6 +3414,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3357 __set_current_state(TASK_RUNNING); 3414 __set_current_state(TASK_RUNNING);
3358 3415
3359 count_vm_event(PGFAULT); 3416 count_vm_event(PGFAULT);
3417 mem_cgroup_count_vm_event(mm, PGFAULT);
3360 3418
3361 /* do counter updates before entering really critical section. */ 3419 /* do counter updates before entering really critical section. */
3362 check_sync_rss_stat(current); 3420 check_sync_rss_stat(current);
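The memory.c changes above convert mmu_gather from a per-CPU structure handed back by tlb_gather_mmu() into a caller-owned, usually on-stack structure, and drop the ZAP_BLOCK_SIZE/zap_work batching in favour of __tlb_remove_page() telling the caller when the gather is full. The new calling convention, assembled from the rewritten zap_page_range() in the hunks above (kernel context only, with mm, vma, address, end and details as in that function):

	struct mmu_gather tlb;
	unsigned long nr_accounted = 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, 0);		/* 0: not tearing down the whole mm */
	update_hiwater_rss(mm);
	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
	tlb_finish_mmu(&tlb, address, end);	/* flush TLBs, free gathered pages and batches */

Because the structure lives on the caller's stack, there is no longer a per-CPU gather to hand back and re-acquire around reschedule points, which is what lets unmap_vmas() shed its tlb_start/zap_work bookkeeping above.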
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9ca1d604f7cd..9f646374e32f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -374,10 +374,6 @@ void online_page(struct page *page)
374 totalhigh_pages++; 374 totalhigh_pages++;
375#endif 375#endif
376 376
377#ifdef CONFIG_FLATMEM
378 max_mapnr = max(pfn, max_mapnr);
379#endif
380
381 ClearPageReserved(page); 377 ClearPageReserved(page);
382 init_page_count(page); 378 init_page_count(page);
383 __free_page(page); 379 __free_page(page);
@@ -400,7 +396,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
400} 396}
401 397
402 398
403int online_pages(unsigned long pfn, unsigned long nr_pages) 399int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
404{ 400{
405 unsigned long onlined_pages = 0; 401 unsigned long onlined_pages = 0;
406 struct zone *zone; 402 struct zone *zone;
@@ -459,8 +455,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
459 zone_pcp_update(zone); 455 zone_pcp_update(zone);
460 456
461 mutex_unlock(&zonelists_mutex); 457 mutex_unlock(&zonelists_mutex);
462 setup_per_zone_wmarks(); 458
463 calculate_zone_inactive_ratio(zone); 459 init_per_zone_wmark_min();
460
464 if (onlined_pages) { 461 if (onlined_pages) {
465 kswapd_run(zone_to_nid(zone)); 462 kswapd_run(zone_to_nid(zone));
466 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 463 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
@@ -705,7 +702,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
705 if (!pfn_valid(pfn)) 702 if (!pfn_valid(pfn))
706 continue; 703 continue;
707 page = pfn_to_page(pfn); 704 page = pfn_to_page(pfn);
708 if (!page_count(page)) 705 if (!get_page_unless_zero(page))
709 continue; 706 continue;
710 /* 707 /*
711 * We can skip free pages. And we can only deal with pages on 708 * We can skip free pages. And we can only deal with pages on
@@ -713,6 +710,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
713 */ 710 */
714 ret = isolate_lru_page(page); 711 ret = isolate_lru_page(page);
715 if (!ret) { /* Success */ 712 if (!ret) { /* Success */
713 put_page(page);
716 list_add_tail(&page->lru, &source); 714 list_add_tail(&page->lru, &source);
717 move_pages--; 715 move_pages--;
718 inc_zone_page_state(page, NR_ISOLATED_ANON + 716 inc_zone_page_state(page, NR_ISOLATED_ANON +
@@ -724,6 +722,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
724 pfn); 722 pfn);
725 dump_page(page); 723 dump_page(page);
726#endif 724#endif
725 put_page(page);
727 /* Because we don't have big zone->lock. we should 726 /* Because we don't have big zone->lock. we should
728 check this again here. */ 727 check this again here. */
729 if (page_count(page)) { 728 if (page_count(page)) {
@@ -795,7 +794,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
795 return offlined; 794 return offlined;
796} 795}
797 796
798static int offline_pages(unsigned long start_pfn, 797static int __ref offline_pages(unsigned long start_pfn,
799 unsigned long end_pfn, unsigned long timeout) 798 unsigned long end_pfn, unsigned long timeout)
800{ 799{
801 unsigned long pfn, nr_pages, expire; 800 unsigned long pfn, nr_pages, expire;
@@ -893,8 +892,8 @@ repeat:
893 zone->zone_pgdat->node_present_pages -= offlined_pages; 892 zone->zone_pgdat->node_present_pages -= offlined_pages;
894 totalram_pages -= offlined_pages; 893 totalram_pages -= offlined_pages;
895 894
896 setup_per_zone_wmarks(); 895 init_per_zone_wmark_min();
897 calculate_zone_inactive_ratio(zone); 896
898 if (!node_present_pages(node)) { 897 if (!node_present_pages(node)) {
899 node_clear_state(node, N_HIGH_MEMORY); 898 node_clear_state(node, N_HIGH_MEMORY);
900 kswapd_stop(node); 899 kswapd_stop(node);
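The do_migrate_range() hunks above change the page pinning protocol: instead of peeking at page_count(), the walker now takes its own reference with get_page_unless_zero() and drops it once isolate_lru_page() has either taken the LRU reference (success) or the page turned out not to be isolatable. Assembled from the new lines, the per-pfn body looks roughly like this (kernel context; the elided parts are unchanged in the hunk):

	if (!pfn_valid(pfn))
		continue;
	page = pfn_to_page(pfn);
	if (!get_page_unless_zero(page))	/* skip free pages, pin everything else */
		continue;
	ret = isolate_lru_page(page);
	if (!ret) {
		put_page(page);			/* LRU isolation now holds its own reference */
		list_add_tail(&page->lru, &source);
		/* ... accounting ... */
	} else {
		put_page(page);			/* drop our pin before the retry checks */
		/* ... not-isolated handling ... */
	}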
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 959a8b8c7350..e7fb9d25c54e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -99,7 +99,6 @@
99/* Internal flags */ 99/* Internal flags */
100#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ 100#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
101#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 101#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
102#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
103 102
104static struct kmem_cache *policy_cache; 103static struct kmem_cache *policy_cache;
105static struct kmem_cache *sn_cache; 104static struct kmem_cache *sn_cache;
@@ -457,7 +456,6 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
457 }, 456 },
458}; 457};
459 458
460static void gather_stats(struct page *, void *, int pte_dirty);
461static void migrate_page_add(struct page *page, struct list_head *pagelist, 459static void migrate_page_add(struct page *page, struct list_head *pagelist,
462 unsigned long flags); 460 unsigned long flags);
463 461
@@ -492,9 +490,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
492 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 490 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
493 continue; 491 continue;
494 492
495 if (flags & MPOL_MF_STATS) 493 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
496 gather_stats(page, private, pte_dirty(*pte));
497 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
498 migrate_page_add(page, private, flags); 494 migrate_page_add(page, private, flags);
499 else 495 else
500 break; 496 break;
@@ -1489,7 +1485,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1489 * freeing by another task. It is the caller's responsibility to free the 1485 * freeing by another task. It is the caller's responsibility to free the
1490 * extra reference for shared policies. 1486 * extra reference for shared policies.
1491 */ 1487 */
1492static struct mempolicy *get_vma_policy(struct task_struct *task, 1488struct mempolicy *get_vma_policy(struct task_struct *task,
1493 struct vm_area_struct *vma, unsigned long addr) 1489 struct vm_area_struct *vma, unsigned long addr)
1494{ 1490{
1495 struct mempolicy *pol = task->mempolicy; 1491 struct mempolicy *pol = task->mempolicy;
@@ -2529,159 +2525,3 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2529 } 2525 }
2530 return p - buffer; 2526 return p - buffer;
2531} 2527}
2532
2533struct numa_maps {
2534 unsigned long pages;
2535 unsigned long anon;
2536 unsigned long active;
2537 unsigned long writeback;
2538 unsigned long mapcount_max;
2539 unsigned long dirty;
2540 unsigned long swapcache;
2541 unsigned long node[MAX_NUMNODES];
2542};
2543
2544static void gather_stats(struct page *page, void *private, int pte_dirty)
2545{
2546 struct numa_maps *md = private;
2547 int count = page_mapcount(page);
2548
2549 md->pages++;
2550 if (pte_dirty || PageDirty(page))
2551 md->dirty++;
2552
2553 if (PageSwapCache(page))
2554 md->swapcache++;
2555
2556 if (PageActive(page) || PageUnevictable(page))
2557 md->active++;
2558
2559 if (PageWriteback(page))
2560 md->writeback++;
2561
2562 if (PageAnon(page))
2563 md->anon++;
2564
2565 if (count > md->mapcount_max)
2566 md->mapcount_max = count;
2567
2568 md->node[page_to_nid(page)]++;
2569}
2570
2571#ifdef CONFIG_HUGETLB_PAGE
2572static void check_huge_range(struct vm_area_struct *vma,
2573 unsigned long start, unsigned long end,
2574 struct numa_maps *md)
2575{
2576 unsigned long addr;
2577 struct page *page;
2578 struct hstate *h = hstate_vma(vma);
2579 unsigned long sz = huge_page_size(h);
2580
2581 for (addr = start; addr < end; addr += sz) {
2582 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2583 addr & huge_page_mask(h));
2584 pte_t pte;
2585
2586 if (!ptep)
2587 continue;
2588
2589 pte = *ptep;
2590 if (pte_none(pte))
2591 continue;
2592
2593 page = pte_page(pte);
2594 if (!page)
2595 continue;
2596
2597 gather_stats(page, md, pte_dirty(*ptep));
2598 }
2599}
2600#else
2601static inline void check_huge_range(struct vm_area_struct *vma,
2602 unsigned long start, unsigned long end,
2603 struct numa_maps *md)
2604{
2605}
2606#endif
2607
2608/*
2609 * Display pages allocated per node and memory policy via /proc.
2610 */
2611int show_numa_map(struct seq_file *m, void *v)
2612{
2613 struct proc_maps_private *priv = m->private;
2614 struct vm_area_struct *vma = v;
2615 struct numa_maps *md;
2616 struct file *file = vma->vm_file;
2617 struct mm_struct *mm = vma->vm_mm;
2618 struct mempolicy *pol;
2619 int n;
2620 char buffer[50];
2621
2622 if (!mm)
2623 return 0;
2624
2625 md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2626 if (!md)
2627 return 0;
2628
2629 pol = get_vma_policy(priv->task, vma, vma->vm_start);
2630 mpol_to_str(buffer, sizeof(buffer), pol, 0);
2631 mpol_cond_put(pol);
2632
2633 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2634
2635 if (file) {
2636 seq_printf(m, " file=");
2637 seq_path(m, &file->f_path, "\n\t= ");
2638 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2639 seq_printf(m, " heap");
2640 } else if (vma->vm_start <= mm->start_stack &&
2641 vma->vm_end >= mm->start_stack) {
2642 seq_printf(m, " stack");
2643 }
2644
2645 if (is_vm_hugetlb_page(vma)) {
2646 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2647 seq_printf(m, " huge");
2648 } else {
2649 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2650 &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2651 }
2652
2653 if (!md->pages)
2654 goto out;
2655
2656 if (md->anon)
2657 seq_printf(m," anon=%lu",md->anon);
2658
2659 if (md->dirty)
2660 seq_printf(m," dirty=%lu",md->dirty);
2661
2662 if (md->pages != md->anon && md->pages != md->dirty)
2663 seq_printf(m, " mapped=%lu", md->pages);
2664
2665 if (md->mapcount_max > 1)
2666 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2667
2668 if (md->swapcache)
2669 seq_printf(m," swapcache=%lu", md->swapcache);
2670
2671 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2672 seq_printf(m," active=%lu", md->active);
2673
2674 if (md->writeback)
2675 seq_printf(m," writeback=%lu", md->writeback);
2676
2677 for_each_node_state(n, N_HIGH_MEMORY)
2678 if (md->node[n])
2679 seq_printf(m, " N%d=%lu", n, md->node[n]);
2680out:
2681 seq_putc(m, '\n');
2682 kfree(md);
2683
2684 if (m->count < m->size)
2685 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2686 return 0;
2687}
diff --git a/mm/migrate.c b/mm/migrate.c
index 34132f8e9109..e4a5c912983d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -721,15 +721,11 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
721 * Only page_lock_anon_vma() understands the subtleties of 721 * Only page_lock_anon_vma() understands the subtleties of
722 * getting a hold on an anon_vma from outside one of its mms. 722 * getting a hold on an anon_vma from outside one of its mms.
723 */ 723 */
724 anon_vma = page_lock_anon_vma(page); 724 anon_vma = page_get_anon_vma(page);
725 if (anon_vma) { 725 if (anon_vma) {
726 /* 726 /*
727 * Take a reference count on the anon_vma if the 727 * Anon page
728 * page is mapped so that it is guaranteed to
729 * exist when the page is remapped later
730 */ 728 */
731 get_anon_vma(anon_vma);
732 page_unlock_anon_vma(anon_vma);
733 } else if (PageSwapCache(page)) { 729 } else if (PageSwapCache(page)) {
734 /* 730 /*
735 * We cannot be sure that the anon_vma of an unmapped 731 * We cannot be sure that the anon_vma of an unmapped
@@ -857,13 +853,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
857 lock_page(hpage); 853 lock_page(hpage);
858 } 854 }
859 855
860 if (PageAnon(hpage)) { 856 if (PageAnon(hpage))
861 anon_vma = page_lock_anon_vma(hpage); 857 anon_vma = page_get_anon_vma(hpage);
862 if (anon_vma) {
863 get_anon_vma(anon_vma);
864 page_unlock_anon_vma(anon_vma);
865 }
866 }
867 858
868 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 859 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
869 860
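The migrate.c hunks above replace the lock/get/unlock dance around page_lock_anon_vma() with the new page_get_anon_vma() helper, which returns an anon_vma with a reference already held (or NULL). Side by side, the two patterns as they appear in the diff:

	/* before */
	anon_vma = page_lock_anon_vma(page);
	if (anon_vma) {
		get_anon_vma(anon_vma);
		page_unlock_anon_vma(anon_vma);
	}

	/* after */
	anon_vma = page_get_anon_vma(page);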
diff --git a/mm/mlock.c b/mm/mlock.c
index 516b2c2ddd5a..048260c4e02e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -307,13 +307,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
307 * For vmas that pass the filters, merge/split as appropriate. 307 * For vmas that pass the filters, merge/split as appropriate.
308 */ 308 */
309static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, 309static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
310 unsigned long start, unsigned long end, unsigned int newflags) 310 unsigned long start, unsigned long end, vm_flags_t newflags)
311{ 311{
312 struct mm_struct *mm = vma->vm_mm; 312 struct mm_struct *mm = vma->vm_mm;
313 pgoff_t pgoff; 313 pgoff_t pgoff;
314 int nr_pages; 314 int nr_pages;
315 int ret = 0; 315 int ret = 0;
316 int lock = newflags & VM_LOCKED; 316 int lock = !!(newflags & VM_LOCKED);
317 317
318 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || 318 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
319 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) 319 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
@@ -385,7 +385,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
385 prev = vma; 385 prev = vma;
386 386
387 for (nstart = start ; ; ) { 387 for (nstart = start ; ; ) {
388 unsigned int newflags; 388 vm_flags_t newflags;
389 389
390 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 390 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
391 391
@@ -524,7 +524,7 @@ static int do_mlockall(int flags)
524 goto out; 524 goto out;
525 525
526 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { 526 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
527 unsigned int newflags; 527 vm_flags_t newflags;
528 528
529 newflags = vma->vm_flags | VM_LOCKED; 529 newflags = vma->vm_flags | VM_LOCKED;
530 if (!(flags & MCL_CURRENT)) 530 if (!(flags & MCL_CURRENT))
diff --git a/mm/mmap.c b/mm/mmap.c
index 772140c53ab1..bbdc9af5e117 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -84,10 +84,14 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags)
84} 84}
85EXPORT_SYMBOL(vm_get_page_prot); 85EXPORT_SYMBOL(vm_get_page_prot);
86 86
87int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 87int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
88int sysctl_overcommit_ratio = 50; /* default is 50% */ 88int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
89int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 89int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
90struct percpu_counter vm_committed_as; 90/*
 91 * Make sure vm_committed_as is in its own cacheline and not shared with
92 * other variables. It can be updated by several CPUs frequently.
93 */
94struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
91 95
92/* 96/*
93 * Check that a process has enough memory to allocate a new virtual 97 * Check that a process has enough memory to allocate a new virtual
@@ -190,7 +194,7 @@ error:
190} 194}
191 195
192/* 196/*
193 * Requires inode->i_mapping->i_mmap_lock 197 * Requires inode->i_mapping->i_mmap_mutex
194 */ 198 */
195static void __remove_shared_vm_struct(struct vm_area_struct *vma, 199static void __remove_shared_vm_struct(struct vm_area_struct *vma,
196 struct file *file, struct address_space *mapping) 200 struct file *file, struct address_space *mapping)
@@ -218,9 +222,9 @@ void unlink_file_vma(struct vm_area_struct *vma)
218 222
219 if (file) { 223 if (file) {
220 struct address_space *mapping = file->f_mapping; 224 struct address_space *mapping = file->f_mapping;
221 spin_lock(&mapping->i_mmap_lock); 225 mutex_lock(&mapping->i_mmap_mutex);
222 __remove_shared_vm_struct(vma, file, mapping); 226 __remove_shared_vm_struct(vma, file, mapping);
223 spin_unlock(&mapping->i_mmap_lock); 227 mutex_unlock(&mapping->i_mmap_mutex);
224 } 228 }
225} 229}
226 230
@@ -394,29 +398,6 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
394 return vma; 398 return vma;
395} 399}
396 400
397static inline void
398__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
399 struct vm_area_struct *prev, struct rb_node *rb_parent)
400{
401 struct vm_area_struct *next;
402
403 vma->vm_prev = prev;
404 if (prev) {
405 next = prev->vm_next;
406 prev->vm_next = vma;
407 } else {
408 mm->mmap = vma;
409 if (rb_parent)
410 next = rb_entry(rb_parent,
411 struct vm_area_struct, vm_rb);
412 else
413 next = NULL;
414 }
415 vma->vm_next = next;
416 if (next)
417 next->vm_prev = vma;
418}
419
420void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 401void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
421 struct rb_node **rb_link, struct rb_node *rb_parent) 402 struct rb_node **rb_link, struct rb_node *rb_parent)
422{ 403{
@@ -464,16 +445,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
464 if (vma->vm_file) 445 if (vma->vm_file)
465 mapping = vma->vm_file->f_mapping; 446 mapping = vma->vm_file->f_mapping;
466 447
467 if (mapping) { 448 if (mapping)
468 spin_lock(&mapping->i_mmap_lock); 449 mutex_lock(&mapping->i_mmap_mutex);
469 vma->vm_truncate_count = mapping->truncate_count;
470 }
471 450
472 __vma_link(mm, vma, prev, rb_link, rb_parent); 451 __vma_link(mm, vma, prev, rb_link, rb_parent);
473 __vma_link_file(vma); 452 __vma_link_file(vma);
474 453
475 if (mapping) 454 if (mapping)
476 spin_unlock(&mapping->i_mmap_lock); 455 mutex_unlock(&mapping->i_mmap_mutex);
477 456
478 mm->map_count++; 457 mm->map_count++;
479 validate_mm(mm); 458 validate_mm(mm);
@@ -576,17 +555,8 @@ again: remove_next = 1 + (end > next->vm_end);
576 mapping = file->f_mapping; 555 mapping = file->f_mapping;
577 if (!(vma->vm_flags & VM_NONLINEAR)) 556 if (!(vma->vm_flags & VM_NONLINEAR))
578 root = &mapping->i_mmap; 557 root = &mapping->i_mmap;
579 spin_lock(&mapping->i_mmap_lock); 558 mutex_lock(&mapping->i_mmap_mutex);
580 if (importer &&
581 vma->vm_truncate_count != next->vm_truncate_count) {
582 /*
583 * unmap_mapping_range might be in progress:
584 * ensure that the expanding vma is rescanned.
585 */
586 importer->vm_truncate_count = 0;
587 }
588 if (insert) { 559 if (insert) {
589 insert->vm_truncate_count = vma->vm_truncate_count;
590 /* 560 /*
591 * Put into prio_tree now, so instantiated pages 561 * Put into prio_tree now, so instantiated pages
592 * are visible to arm/parisc __flush_dcache_page 562 * are visible to arm/parisc __flush_dcache_page
@@ -605,7 +575,7 @@ again: remove_next = 1 + (end > next->vm_end);
605 * lock may be shared between many sibling processes. Skipping 575 * lock may be shared between many sibling processes. Skipping
606 * the lock for brk adjustments makes a difference sometimes. 576 * the lock for brk adjustments makes a difference sometimes.
607 */ 577 */
608 if (vma->anon_vma && (insert || importer || start != vma->vm_start)) { 578 if (vma->anon_vma && (importer || start != vma->vm_start)) {
609 anon_vma = vma->anon_vma; 579 anon_vma = vma->anon_vma;
610 anon_vma_lock(anon_vma); 580 anon_vma_lock(anon_vma);
611 } 581 }
@@ -652,7 +622,7 @@ again: remove_next = 1 + (end > next->vm_end);
652 if (anon_vma) 622 if (anon_vma)
653 anon_vma_unlock(anon_vma); 623 anon_vma_unlock(anon_vma);
654 if (mapping) 624 if (mapping)
655 spin_unlock(&mapping->i_mmap_lock); 625 mutex_unlock(&mapping->i_mmap_mutex);
656 626
657 if (remove_next) { 627 if (remove_next) {
658 if (file) { 628 if (file) {
@@ -699,9 +669,17 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
699} 669}
700 670
701static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, 671static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
702 struct anon_vma *anon_vma2) 672 struct anon_vma *anon_vma2,
673 struct vm_area_struct *vma)
703{ 674{
704 return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2); 675 /*
 676 * The list_is_singular() test is to avoid merging VMAs cloned from
 677 * parents, which improves scalability by reducing anon_vma lock contention.
678 */
679 if ((!anon_vma1 || !anon_vma2) && (!vma ||
680 list_is_singular(&vma->anon_vma_chain)))
681 return 1;
682 return anon_vma1 == anon_vma2;
705} 683}
706 684
707/* 685/*
@@ -720,7 +698,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
720 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 698 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
721{ 699{
722 if (is_mergeable_vma(vma, file, vm_flags) && 700 if (is_mergeable_vma(vma, file, vm_flags) &&
723 is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { 701 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
724 if (vma->vm_pgoff == vm_pgoff) 702 if (vma->vm_pgoff == vm_pgoff)
725 return 1; 703 return 1;
726 } 704 }
@@ -739,7 +717,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
739 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 717 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
740{ 718{
741 if (is_mergeable_vma(vma, file, vm_flags) && 719 if (is_mergeable_vma(vma, file, vm_flags) &&
742 is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { 720 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
743 pgoff_t vm_pglen; 721 pgoff_t vm_pglen;
744 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 722 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
745 if (vma->vm_pgoff + vm_pglen == vm_pgoff) 723 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
@@ -817,7 +795,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
817 can_vma_merge_before(next, vm_flags, 795 can_vma_merge_before(next, vm_flags,
818 anon_vma, file, pgoff+pglen) && 796 anon_vma, file, pgoff+pglen) &&
819 is_mergeable_anon_vma(prev->anon_vma, 797 is_mergeable_anon_vma(prev->anon_vma,
820 next->anon_vma)) { 798 next->anon_vma, NULL)) {
821 /* cases 1, 6 */ 799 /* cases 1, 6 */
822 err = vma_adjust(prev, prev->vm_start, 800 err = vma_adjust(prev, prev->vm_start,
823 next->vm_end, prev->vm_pgoff, NULL); 801 next->vm_end, prev->vm_pgoff, NULL);
@@ -982,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
982{ 960{
983 struct mm_struct * mm = current->mm; 961 struct mm_struct * mm = current->mm;
984 struct inode *inode; 962 struct inode *inode;
985 unsigned int vm_flags; 963 vm_flags_t vm_flags;
986 int error; 964 int error;
987 unsigned long reqprot = prot; 965 unsigned long reqprot = prot;
988 966
@@ -1187,7 +1165,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1187 */ 1165 */
1188int vma_wants_writenotify(struct vm_area_struct *vma) 1166int vma_wants_writenotify(struct vm_area_struct *vma)
1189{ 1167{
1190 unsigned int vm_flags = vma->vm_flags; 1168 vm_flags_t vm_flags = vma->vm_flags;
1191 1169
1192 /* If it was private or non-writable, the write bit is already clear */ 1170 /* If it was private or non-writable, the write bit is already clear */
1193 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) 1171 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
@@ -1215,7 +1193,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1215 * We account for memory if it's a private writeable mapping, 1193 * We account for memory if it's a private writeable mapping,
1216 * not hugepages and VM_NORESERVE wasn't set. 1194 * not hugepages and VM_NORESERVE wasn't set.
1217 */ 1195 */
1218static inline int accountable_mapping(struct file *file, unsigned int vm_flags) 1196static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1219{ 1197{
1220 /* 1198 /*
1221 * hugetlb has its own accounting separate from the core VM 1199 * hugetlb has its own accounting separate from the core VM
@@ -1229,7 +1207,7 @@ static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
1229 1207
1230unsigned long mmap_region(struct file *file, unsigned long addr, 1208unsigned long mmap_region(struct file *file, unsigned long addr,
1231 unsigned long len, unsigned long flags, 1209 unsigned long len, unsigned long flags,
1232 unsigned int vm_flags, unsigned long pgoff) 1210 vm_flags_t vm_flags, unsigned long pgoff)
1233{ 1211{
1234 struct mm_struct *mm = current->mm; 1212 struct mm_struct *mm = current->mm;
1235 struct vm_area_struct *vma, *prev; 1213 struct vm_area_struct *vma, *prev;
@@ -1785,7 +1763,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1785/* 1763/*
1786 * vma is the first one with address < vma->vm_start. Have to extend vma. 1764 * vma is the first one with address < vma->vm_start. Have to extend vma.
1787 */ 1765 */
1788static int expand_downwards(struct vm_area_struct *vma, 1766int expand_downwards(struct vm_area_struct *vma,
1789 unsigned long address) 1767 unsigned long address)
1790{ 1768{
1791 int error; 1769 int error;
@@ -1832,11 +1810,6 @@ static int expand_downwards(struct vm_area_struct *vma,
1832 return error; 1810 return error;
1833} 1811}
1834 1812
1835int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address)
1836{
1837 return expand_downwards(vma, address);
1838}
1839
1840#ifdef CONFIG_STACK_GROWSUP 1813#ifdef CONFIG_STACK_GROWSUP
1841int expand_stack(struct vm_area_struct *vma, unsigned long address) 1814int expand_stack(struct vm_area_struct *vma, unsigned long address)
1842{ 1815{
@@ -1919,17 +1892,17 @@ static void unmap_region(struct mm_struct *mm,
1919 unsigned long start, unsigned long end) 1892 unsigned long start, unsigned long end)
1920{ 1893{
1921 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; 1894 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
1922 struct mmu_gather *tlb; 1895 struct mmu_gather tlb;
1923 unsigned long nr_accounted = 0; 1896 unsigned long nr_accounted = 0;
1924 1897
1925 lru_add_drain(); 1898 lru_add_drain();
1926 tlb = tlb_gather_mmu(mm, 0); 1899 tlb_gather_mmu(&tlb, mm, 0);
1927 update_hiwater_rss(mm); 1900 update_hiwater_rss(mm);
1928 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); 1901 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1929 vm_unacct_memory(nr_accounted); 1902 vm_unacct_memory(nr_accounted);
1930 free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, 1903 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
1931 next? next->vm_start: 0); 1904 next ? next->vm_start : 0);
1932 tlb_finish_mmu(tlb, start, end); 1905 tlb_finish_mmu(&tlb, start, end);
1933} 1906}
1934 1907
1935/* 1908/*
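The unmap_region() hunk above (and the exit_mmap() hunks that follow) move the mmu_gather from a pointer handed back by tlb_gather_mmu() to a structure on the caller's stack, so every gather/flush call now takes &tlb. A minimal sketch of the new calling convention, using only the calls visible in this hunk; the function name is invented and the real VMA walk and page-table freeing are elided:

/* Sketch of the on-stack mmu_gather pattern introduced above. */
static void teardown_range(struct mm_struct *mm, struct vm_area_struct *vma,
                           unsigned long start, unsigned long end)
{
        struct mmu_gather tlb;            /* lives on the caller's stack now */
        unsigned long nr_accounted = 0;

        tlb_gather_mmu(&tlb, mm, 0);      /* 0: not a full-mm teardown */
        unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
        vm_unacct_memory(nr_accounted);
        tlb_finish_mmu(&tlb, start, end); /* flush TLBs, free gathered pages */
}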
@@ -2271,7 +2244,7 @@ EXPORT_SYMBOL(do_brk);
2271/* Release all mmaps. */ 2244/* Release all mmaps. */
2272void exit_mmap(struct mm_struct *mm) 2245void exit_mmap(struct mm_struct *mm)
2273{ 2246{
2274 struct mmu_gather *tlb; 2247 struct mmu_gather tlb;
2275 struct vm_area_struct *vma; 2248 struct vm_area_struct *vma;
2276 unsigned long nr_accounted = 0; 2249 unsigned long nr_accounted = 0;
2277 unsigned long end; 2250 unsigned long end;
@@ -2296,14 +2269,14 @@ void exit_mmap(struct mm_struct *mm)
2296 2269
2297 lru_add_drain(); 2270 lru_add_drain();
2298 flush_cache_mm(mm); 2271 flush_cache_mm(mm);
2299 tlb = tlb_gather_mmu(mm, 1); 2272 tlb_gather_mmu(&tlb, mm, 1);
2300 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2273 /* update_hiwater_rss(mm) here? but nobody should be looking */
2301 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2274 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2302 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2275 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2303 vm_unacct_memory(nr_accounted); 2276 vm_unacct_memory(nr_accounted);
2304 2277
2305 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); 2278 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
2306 tlb_finish_mmu(tlb, 0, end); 2279 tlb_finish_mmu(&tlb, 0, end);
2307 2280
2308 /* 2281 /*
2309 * Walk the list again, actually closing and freeing it, 2282 * Walk the list again, actually closing and freeing it,
@@ -2317,7 +2290,7 @@ void exit_mmap(struct mm_struct *mm)
2317 2290
2318/* Insert vm structure into process list sorted by address 2291/* Insert vm structure into process list sorted by address
2319 * and into the inode's i_mmap tree. If vm_file is non-NULL 2292 * and into the inode's i_mmap tree. If vm_file is non-NULL
2320 * then i_mmap_lock is taken here. 2293 * then i_mmap_mutex is taken here.
2321 */ 2294 */
2322int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) 2295int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2323{ 2296{
@@ -2529,15 +2502,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2529 * The LSB of head.next can't change from under us 2502 * The LSB of head.next can't change from under us
2530 * because we hold the mm_all_locks_mutex. 2503 * because we hold the mm_all_locks_mutex.
2531 */ 2504 */
2532 spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem); 2505 mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
2533 /* 2506 /*
2534 * We can safely modify head.next after taking the 2507 * We can safely modify head.next after taking the
2535 * anon_vma->root->lock. If some other vma in this mm shares 2508 * anon_vma->root->mutex. If some other vma in this mm shares
2536 * the same anon_vma we won't take it again. 2509 * the same anon_vma we won't take it again.
2537 * 2510 *
2538 * No need of atomic instructions here, head.next 2511 * No need of atomic instructions here, head.next
2539 * can't change from under us thanks to the 2512 * can't change from under us thanks to the
2540 * anon_vma->root->lock. 2513 * anon_vma->root->mutex.
2541 */ 2514 */
2542 if (__test_and_set_bit(0, (unsigned long *) 2515 if (__test_and_set_bit(0, (unsigned long *)
2543 &anon_vma->root->head.next)) 2516 &anon_vma->root->head.next))
@@ -2559,7 +2532,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2559 */ 2532 */
2560 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 2533 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
2561 BUG(); 2534 BUG();
2562 spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); 2535 mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
2563 } 2536 }
2564} 2537}
2565 2538
@@ -2586,7 +2559,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2586 * vma in this mm is backed by the same anon_vma or address_space. 2559 * vma in this mm is backed by the same anon_vma or address_space.
2587 * 2560 *
2588 * We can take all the locks in random order because the VM code 2561 * We can take all the locks in random order because the VM code
2589 * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never 2562 * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never
2590 * takes more than one of them in a row. Secondly we're protected 2563 * takes more than one of them in a row. Secondly we're protected
2591 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 2564 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2592 * 2565 *
@@ -2642,7 +2615,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2642 * 2615 *
2643 * No need of atomic instructions here, head.next 2616 * No need of atomic instructions here, head.next
2644 * can't change from under us until we release the 2617 * can't change from under us until we release the
2645 * anon_vma->root->lock. 2618 * anon_vma->root->mutex.
2646 */ 2619 */
2647 if (!__test_and_clear_bit(0, (unsigned long *) 2620 if (!__test_and_clear_bit(0, (unsigned long *)
2648 &anon_vma->root->head.next)) 2621 &anon_vma->root->head.next))
@@ -2658,7 +2631,7 @@ static void vm_unlock_mapping(struct address_space *mapping)
2658 * AS_MM_ALL_LOCKS can't change to 0 from under us 2631 * AS_MM_ALL_LOCKS can't change to 0 from under us
2659 * because we hold the mm_all_locks_mutex. 2632 * because we hold the mm_all_locks_mutex.
2660 */ 2633 */
2661 spin_unlock(&mapping->i_mmap_lock); 2634 mutex_unlock(&mapping->i_mmap_mutex);
2662 if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 2635 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
2663 &mapping->flags)) 2636 &mapping->flags))
2664 BUG(); 2637 BUG();
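The remaining mm/mmap.c hunks above track the i_mmap_lock to i_mmap_mutex and anon_vma->lock to anon_vma->root->mutex conversions inside mm_take_all_locks() and mm_drop_all_locks(). The lockdep annotation keeps the same shape: each per-object lock is taken nested under mmap_sem through the *_nest_lock() helper. A reduced sketch of that nesting pattern (helper names invented; the head.next and AS_MM_ALL_LOCKS bookkeeping from the real code is left out):

/* Sketch of the nesting pattern used above: each object mutex is taken
 * as a nested lock under mmap_sem so lockdep treats the batch as one. */
static void lock_one_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
        mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
}

static void lock_one_mapping(struct mm_struct *mm, struct address_space *mapping)
{
        mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
}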
diff --git a/mm/mremap.c b/mm/mremap.c
index a7c1f9f9b941..506fa44403df 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -93,8 +93,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
93 * and we propagate stale pages into the dst afterward. 93 * and we propagate stale pages into the dst afterward.
94 */ 94 */
95 mapping = vma->vm_file->f_mapping; 95 mapping = vma->vm_file->f_mapping;
96 spin_lock(&mapping->i_mmap_lock); 96 mutex_lock(&mapping->i_mmap_mutex);
97 new_vma->vm_truncate_count = 0;
98 } 97 }
99 98
100 /* 99 /*
@@ -123,7 +122,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
123 pte_unmap(new_pte - 1); 122 pte_unmap(new_pte - 1);
124 pte_unmap_unlock(old_pte - 1, old_ptl); 123 pte_unmap_unlock(old_pte - 1, old_ptl);
125 if (mapping) 124 if (mapping)
126 spin_unlock(&mapping->i_mmap_lock); 125 mutex_unlock(&mapping->i_mmap_mutex);
127 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); 126 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
128} 127}
129 128
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 9109049f0bbc..6e93dc7f2586 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -307,30 +307,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
307void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, 307void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
308 unsigned long align, unsigned long goal) 308 unsigned long align, unsigned long goal)
309{ 309{
310#ifdef MAX_DMA32_PFN
311 unsigned long end_pfn;
312
313 if (WARN_ON_ONCE(slab_is_available()))
314 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
315
316 /* update goal according ...MAX_DMA32_PFN */
317 end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
318
319 if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
320 (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
321 void *ptr;
322 unsigned long new_goal;
323
324 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
325 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
326 new_goal, -1ULL);
327 if (ptr)
328 return ptr;
329 }
330#endif
331
332 return __alloc_bootmem_node(pgdat, size, align, goal); 310 return __alloc_bootmem_node(pgdat, size, align, goal);
333
334} 311}
335 312
336#ifdef CONFIG_SPARSEMEM 313#ifdef CONFIG_SPARSEMEM
diff --git a/mm/nommu.c b/mm/nommu.c
index c4c542c736a9..1fd0c51b10a6 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -680,9 +680,9 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
680 */ 680 */
681static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) 681static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
682{ 682{
683 struct vm_area_struct *pvma, **pp, *next; 683 struct vm_area_struct *pvma, *prev;
684 struct address_space *mapping; 684 struct address_space *mapping;
685 struct rb_node **p, *parent; 685 struct rb_node **p, *parent, *rb_prev;
686 686
687 kenter(",%p", vma); 687 kenter(",%p", vma);
688 688
@@ -703,7 +703,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
703 } 703 }
704 704
705 /* add the VMA to the tree */ 705 /* add the VMA to the tree */
706 parent = NULL; 706 parent = rb_prev = NULL;
707 p = &mm->mm_rb.rb_node; 707 p = &mm->mm_rb.rb_node;
708 while (*p) { 708 while (*p) {
709 parent = *p; 709 parent = *p;
@@ -713,17 +713,20 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
713 * (the latter is necessary as we may get identical VMAs) */ 713 * (the latter is necessary as we may get identical VMAs) */
714 if (vma->vm_start < pvma->vm_start) 714 if (vma->vm_start < pvma->vm_start)
715 p = &(*p)->rb_left; 715 p = &(*p)->rb_left;
716 else if (vma->vm_start > pvma->vm_start) 716 else if (vma->vm_start > pvma->vm_start) {
717 rb_prev = parent;
717 p = &(*p)->rb_right; 718 p = &(*p)->rb_right;
718 else if (vma->vm_end < pvma->vm_end) 719 } else if (vma->vm_end < pvma->vm_end)
719 p = &(*p)->rb_left; 720 p = &(*p)->rb_left;
720 else if (vma->vm_end > pvma->vm_end) 721 else if (vma->vm_end > pvma->vm_end) {
722 rb_prev = parent;
721 p = &(*p)->rb_right; 723 p = &(*p)->rb_right;
722 else if (vma < pvma) 724 } else if (vma < pvma)
723 p = &(*p)->rb_left; 725 p = &(*p)->rb_left;
724 else if (vma > pvma) 726 else if (vma > pvma) {
727 rb_prev = parent;
725 p = &(*p)->rb_right; 728 p = &(*p)->rb_right;
726 else 729 } else
727 BUG(); 730 BUG();
728 } 731 }
729 732
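The add_vma_to_mm() hunk above makes the rbtree descent remember the last node it stepped right from (rb_prev), which is exactly the in-order predecessor of the insertion point; the next hunk then uses it to link the new VMA into mm->mmap via __vma_link_list() instead of rescanning the whole list. A standalone sketch of the "remember the predecessor while descending" idea (plain C, not kernel code; node and key names are invented):

#include <stddef.h>

struct node {
        long key;
        struct node *left, *right;
};

/* Descend a binary search tree looking for where `key` would be
 * inserted, and record the last node we stepped right from: that node
 * is the in-order predecessor of the insertion point, so the caller can
 * splice a new element into a sorted list without a second walk. */
static struct node *find_slot(struct node *root, long key, struct node **prev)
{
        struct node *cur = root;

        *prev = NULL;
        while (cur) {
                if (key < cur->key) {
                        cur = cur->left;
                } else {
                        *prev = cur;     /* everything here sorts before key */
                        cur = cur->right;
                }
        }
        return *prev;                    /* predecessor, or NULL if key sorts first */
}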
@@ -731,20 +734,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
731 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 734 rb_insert_color(&vma->vm_rb, &mm->mm_rb);
732 735
733 /* add VMA to the VMA list also */ 736 /* add VMA to the VMA list also */
734 for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) { 737 prev = NULL;
735 if (pvma->vm_start > vma->vm_start) 738 if (rb_prev)
736 break; 739 prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
737 if (pvma->vm_start < vma->vm_start)
738 continue;
739 if (pvma->vm_end < vma->vm_end)
740 break;
741 }
742 740
743 next = *pp; 741 __vma_link_list(mm, vma, prev, parent);
744 *pp = vma;
745 vma->vm_next = next;
746 if (next)
747 next->vm_prev = vma;
748} 742}
749 743
750/* 744/*
@@ -752,7 +746,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
752 */ 746 */
753static void delete_vma_from_mm(struct vm_area_struct *vma) 747static void delete_vma_from_mm(struct vm_area_struct *vma)
754{ 748{
755 struct vm_area_struct **pp;
756 struct address_space *mapping; 749 struct address_space *mapping;
757 struct mm_struct *mm = vma->vm_mm; 750 struct mm_struct *mm = vma->vm_mm;
758 751
@@ -775,12 +768,14 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
775 768
776 /* remove from the MM's tree and list */ 769 /* remove from the MM's tree and list */
777 rb_erase(&vma->vm_rb, &mm->mm_rb); 770 rb_erase(&vma->vm_rb, &mm->mm_rb);
778 for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) { 771
779 if (*pp == vma) { 772 if (vma->vm_prev)
780 *pp = vma->vm_next; 773 vma->vm_prev->vm_next = vma->vm_next;
781 break; 774 else
782 } 775 mm->mmap = vma->vm_next;
783 } 776
777 if (vma->vm_next)
778 vma->vm_next->vm_prev = vma->vm_prev;
784 779
785 vma->vm_mm = NULL; 780 vma->vm_mm = NULL;
786} 781}
@@ -809,17 +804,15 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
809struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 804struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
810{ 805{
811 struct vm_area_struct *vma; 806 struct vm_area_struct *vma;
812 struct rb_node *n = mm->mm_rb.rb_node;
813 807
814 /* check the cache first */ 808 /* check the cache first */
815 vma = mm->mmap_cache; 809 vma = mm->mmap_cache;
816 if (vma && vma->vm_start <= addr && vma->vm_end > addr) 810 if (vma && vma->vm_start <= addr && vma->vm_end > addr)
817 return vma; 811 return vma;
818 812
819 /* trawl the tree (there may be multiple mappings in which addr 813 /* trawl the list (there may be multiple mappings in which addr
820 * resides) */ 814 * resides) */
821 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { 815 for (vma = mm->mmap; vma; vma = vma->vm_next) {
822 vma = rb_entry(n, struct vm_area_struct, vm_rb);
823 if (vma->vm_start > addr) 816 if (vma->vm_start > addr)
824 return NULL; 817 return NULL;
825 if (vma->vm_end > addr) { 818 if (vma->vm_end > addr) {
@@ -859,7 +852,6 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
859 unsigned long len) 852 unsigned long len)
860{ 853{
861 struct vm_area_struct *vma; 854 struct vm_area_struct *vma;
862 struct rb_node *n = mm->mm_rb.rb_node;
863 unsigned long end = addr + len; 855 unsigned long end = addr + len;
864 856
865 /* check the cache first */ 857 /* check the cache first */
@@ -867,10 +859,9 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
867 if (vma && vma->vm_start == addr && vma->vm_end == end) 859 if (vma && vma->vm_start == addr && vma->vm_end == end)
868 return vma; 860 return vma;
869 861
870 /* trawl the tree (there may be multiple mappings in which addr 862 /* trawl the list (there may be multiple mappings in which addr
871 * resides) */ 863 * resides) */
872 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { 864 for (vma = mm->mmap; vma; vma = vma->vm_next) {
873 vma = rb_entry(n, struct vm_area_struct, vm_rb);
874 if (vma->vm_start < addr) 865 if (vma->vm_start < addr)
875 continue; 866 continue;
876 if (vma->vm_start > addr) 867 if (vma->vm_start > addr)
@@ -1133,7 +1124,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1133 unsigned long capabilities) 1124 unsigned long capabilities)
1134{ 1125{
1135 struct page *pages; 1126 struct page *pages;
1136 unsigned long total, point, n, rlen; 1127 unsigned long total, point, n;
1137 void *base; 1128 void *base;
1138 int ret, order; 1129 int ret, order;
1139 1130
@@ -1157,13 +1148,12 @@ static int do_mmap_private(struct vm_area_struct *vma,
1157 * make a private copy of the data and map that instead */ 1148 * make a private copy of the data and map that instead */
1158 } 1149 }
1159 1150
1160 rlen = PAGE_ALIGN(len);
1161 1151
1162 /* allocate some memory to hold the mapping 1152 /* allocate some memory to hold the mapping
1163 * - note that this may not return a page-aligned address if the object 1153 * - note that this may not return a page-aligned address if the object
1164 * we're allocating is smaller than a page 1154 * we're allocating is smaller than a page
1165 */ 1155 */
1166 order = get_order(rlen); 1156 order = get_order(len);
1167 kdebug("alloc order %d for %lx", order, len); 1157 kdebug("alloc order %d for %lx", order, len);
1168 1158
1169 pages = alloc_pages(GFP_KERNEL, order); 1159 pages = alloc_pages(GFP_KERNEL, order);
@@ -1173,7 +1163,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1173 total = 1 << order; 1163 total = 1 << order;
1174 atomic_long_add(total, &mmap_pages_allocated); 1164 atomic_long_add(total, &mmap_pages_allocated);
1175 1165
1176 point = rlen >> PAGE_SHIFT; 1166 point = len >> PAGE_SHIFT;
1177 1167
1178 /* we allocated a power-of-2 sized page set, so we may want to trim off 1168 /* we allocated a power-of-2 sized page set, so we may want to trim off
1179 * the excess */ 1169 * the excess */
@@ -1195,7 +1185,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1195 base = page_address(pages); 1185 base = page_address(pages);
1196 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; 1186 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
1197 region->vm_start = (unsigned long) base; 1187 region->vm_start = (unsigned long) base;
1198 region->vm_end = region->vm_start + rlen; 1188 region->vm_end = region->vm_start + len;
1199 region->vm_top = region->vm_start + (total << PAGE_SHIFT); 1189 region->vm_top = region->vm_start + (total << PAGE_SHIFT);
1200 1190
1201 vma->vm_start = region->vm_start; 1191 vma->vm_start = region->vm_start;
@@ -1211,22 +1201,22 @@ static int do_mmap_private(struct vm_area_struct *vma,
1211 1201
1212 old_fs = get_fs(); 1202 old_fs = get_fs();
1213 set_fs(KERNEL_DS); 1203 set_fs(KERNEL_DS);
1214 ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos); 1204 ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
1215 set_fs(old_fs); 1205 set_fs(old_fs);
1216 1206
1217 if (ret < 0) 1207 if (ret < 0)
1218 goto error_free; 1208 goto error_free;
1219 1209
1220 /* clear the last little bit */ 1210 /* clear the last little bit */
1221 if (ret < rlen) 1211 if (ret < len)
1222 memset(base + ret, 0, rlen - ret); 1212 memset(base + ret, 0, len - ret);
1223 1213
1224 } 1214 }
1225 1215
1226 return 0; 1216 return 0;
1227 1217
1228error_free: 1218error_free:
1229 free_page_series(region->vm_start, region->vm_end); 1219 free_page_series(region->vm_start, region->vm_top);
1230 region->vm_start = vma->vm_start = 0; 1220 region->vm_start = vma->vm_start = 0;
1231 region->vm_end = vma->vm_end = 0; 1221 region->vm_end = vma->vm_end = 0;
1232 region->vm_top = 0; 1222 region->vm_top = 0;
@@ -1235,7 +1225,7 @@ error_free:
1235enomem: 1225enomem:
1236 printk("Allocation of length %lu from process %d (%s) failed\n", 1226 printk("Allocation of length %lu from process %d (%s) failed\n",
1237 len, current->pid, current->comm); 1227 len, current->pid, current->comm);
1238 show_free_areas(); 1228 show_free_areas(0);
1239 return -ENOMEM; 1229 return -ENOMEM;
1240} 1230}
1241 1231
@@ -1268,6 +1258,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1268 1258
1269 /* we ignore the address hint */ 1259 /* we ignore the address hint */
1270 addr = 0; 1260 addr = 0;
1261 len = PAGE_ALIGN(len);
1271 1262
1272 /* we've determined that we can make the mapping, now translate what we 1263 /* we've determined that we can make the mapping, now translate what we
1273 * now know into VMA flags */ 1264 * now know into VMA flags */
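With the nommu changes above, the mapping length is rounded up once with PAGE_ALIGN() in do_mmap_pgoff() (and, further below, in do_munmap() and do_mremap()) instead of each helper keeping its own rlen copy. PAGE_ALIGN() simply rounds up to the next page boundary; a standalone arithmetic sketch assuming 4 KiB pages (the real macro comes from the kernel headers):

#include <stdio.h>

/* Standalone illustration of the usual round-up-to-page-size macro. */
#define PAGE_SIZE      4096UL
#define PAGE_ALIGN(x)  (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
        printf("%lu\n", PAGE_ALIGN(1UL));     /* 4096 */
        printf("%lu\n", PAGE_ALIGN(4096UL));  /* 4096 */
        printf("%lu\n", PAGE_ALIGN(4097UL));  /* 8192 */
        return 0;
}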
@@ -1385,15 +1376,15 @@ unsigned long do_mmap_pgoff(struct file *file,
1385 if (capabilities & BDI_CAP_MAP_DIRECT) { 1376 if (capabilities & BDI_CAP_MAP_DIRECT) {
1386 addr = file->f_op->get_unmapped_area(file, addr, len, 1377 addr = file->f_op->get_unmapped_area(file, addr, len,
1387 pgoff, flags); 1378 pgoff, flags);
1388 if (IS_ERR((void *) addr)) { 1379 if (IS_ERR_VALUE(addr)) {
1389 ret = addr; 1380 ret = addr;
1390 if (ret != (unsigned long) -ENOSYS) 1381 if (ret != -ENOSYS)
1391 goto error_just_free; 1382 goto error_just_free;
1392 1383
1393 /* the driver refused to tell us where to site 1384 /* the driver refused to tell us where to site
1394 * the mapping so we'll have to attempt to copy 1385 * the mapping so we'll have to attempt to copy
1395 * it */ 1386 * it */
1396 ret = (unsigned long) -ENODEV; 1387 ret = -ENODEV;
1397 if (!(capabilities & BDI_CAP_MAP_COPY)) 1388 if (!(capabilities & BDI_CAP_MAP_COPY))
1398 goto error_just_free; 1389 goto error_just_free;
1399 1390
@@ -1468,14 +1459,14 @@ error_getting_vma:
1468 printk(KERN_WARNING "Allocation of vma for %lu byte allocation" 1459 printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
1469 " from process %d failed\n", 1460 " from process %d failed\n",
1470 len, current->pid); 1461 len, current->pid);
1471 show_free_areas(); 1462 show_free_areas(0);
1472 return -ENOMEM; 1463 return -ENOMEM;
1473 1464
1474error_getting_region: 1465error_getting_region:
1475 printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" 1466 printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
1476 " from process %d failed\n", 1467 " from process %d failed\n",
1477 len, current->pid); 1468 len, current->pid);
1478 show_free_areas(); 1469 show_free_areas(0);
1479 return -ENOMEM; 1470 return -ENOMEM;
1480} 1471}
1481EXPORT_SYMBOL(do_mmap_pgoff); 1472EXPORT_SYMBOL(do_mmap_pgoff);
@@ -1644,15 +1635,17 @@ static int shrink_vma(struct mm_struct *mm,
1644int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) 1635int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1645{ 1636{
1646 struct vm_area_struct *vma; 1637 struct vm_area_struct *vma;
1647 struct rb_node *rb; 1638 unsigned long end;
1648 unsigned long end = start + len;
1649 int ret; 1639 int ret;
1650 1640
1651 kenter(",%lx,%zx", start, len); 1641 kenter(",%lx,%zx", start, len);
1652 1642
1643 len = PAGE_ALIGN(len);
1653 if (len == 0) 1644 if (len == 0)
1654 return -EINVAL; 1645 return -EINVAL;
1655 1646
1647 end = start + len;
1648
1656 /* find the first potentially overlapping VMA */ 1649 /* find the first potentially overlapping VMA */
1657 vma = find_vma(mm, start); 1650 vma = find_vma(mm, start);
1658 if (!vma) { 1651 if (!vma) {
@@ -1677,9 +1670,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1677 } 1670 }
1678 if (end == vma->vm_end) 1671 if (end == vma->vm_end)
1679 goto erase_whole_vma; 1672 goto erase_whole_vma;
1680 rb = rb_next(&vma->vm_rb); 1673 vma = vma->vm_next;
1681 vma = rb_entry(rb, struct vm_area_struct, vm_rb); 1674 } while (vma);
1682 } while (rb);
1683 kleave(" = -EINVAL [split file]"); 1675 kleave(" = -EINVAL [split file]");
1684 return -EINVAL; 1676 return -EINVAL;
1685 } else { 1677 } else {
@@ -1773,6 +1765,8 @@ unsigned long do_mremap(unsigned long addr,
1773 struct vm_area_struct *vma; 1765 struct vm_area_struct *vma;
1774 1766
1775 /* insanity checks first */ 1767 /* insanity checks first */
1768 old_len = PAGE_ALIGN(old_len);
1769 new_len = PAGE_ALIGN(new_len);
1776 if (old_len == 0 || new_len == 0) 1770 if (old_len == 0 || new_len == 0)
1777 return (unsigned long) -EINVAL; 1771 return (unsigned long) -EINVAL;
1778 1772
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f52e85c80e8d..e4b0991ca351 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -38,6 +38,33 @@ int sysctl_oom_kill_allocating_task;
38int sysctl_oom_dump_tasks = 1; 38int sysctl_oom_dump_tasks = 1;
39static DEFINE_SPINLOCK(zone_scan_lock); 39static DEFINE_SPINLOCK(zone_scan_lock);
40 40
41/**
42 * test_set_oom_score_adj() - set current's oom_score_adj and return old value
43 * @new_val: new oom_score_adj value
44 *
45 * Sets the oom_score_adj value for current to @new_val with proper
46 * synchronization and returns the old value. Usually used to temporarily
47 * set a value, save the old value in the caller, and then reinstate it later.
48 */
49int test_set_oom_score_adj(int new_val)
50{
51 struct sighand_struct *sighand = current->sighand;
52 int old_val;
53
54 spin_lock_irq(&sighand->siglock);
55 old_val = current->signal->oom_score_adj;
56 if (new_val != old_val) {
57 if (new_val == OOM_SCORE_ADJ_MIN)
58 atomic_inc(&current->mm->oom_disable_count);
59 else if (old_val == OOM_SCORE_ADJ_MIN)
60 atomic_dec(&current->mm->oom_disable_count);
61 current->signal->oom_score_adj = new_val;
62 }
63 spin_unlock_irq(&sighand->siglock);
64
65 return old_val;
66}
67
41#ifdef CONFIG_NUMA 68#ifdef CONFIG_NUMA
42/** 69/**
43 * has_intersects_mems_allowed() - check task eligibility for kill 70
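The kernel-doc for test_set_oom_score_adj() above already spells out the intended pattern: temporarily pin a value, keep the old one, and restore it afterwards. A minimal caller sketch following that description; the surrounding function is hypothetical:

/* Hypothetical caller: make current immune to the OOM killer for a
 * critical section, then put the previous oom_score_adj back. */
static void do_something_unkillable(void)
{
        int old_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MIN);

        /* ... work that must not be OOM-killed ... */

        test_set_oom_score_adj(old_adj);   /* reinstate the saved value */
}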
@@ -155,15 +182,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
155 } 182 }
156 183
157 /* 184 /*
158 * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
159 * priority for oom killing.
160 */
161 if (p->flags & PF_OOM_ORIGIN) {
162 task_unlock(p);
163 return 1000;
164 }
165
166 /*
167 * The memory controller may have a limit of 0 bytes, so avoid a divide 185 * The memory controller may have a limit of 0 bytes, so avoid a divide
168 * by zero, if necessary. 186 * by zero, if necessary.
169 */ 187 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3f8bce264df6..a4e1db3f1981 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -30,6 +30,7 @@
30#include <linux/pagevec.h> 30#include <linux/pagevec.h>
31#include <linux/blkdev.h> 31#include <linux/blkdev.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/ratelimit.h>
33#include <linux/oom.h> 34#include <linux/oom.h>
34#include <linux/notifier.h> 35#include <linux/notifier.h>
35#include <linux/topology.h> 36#include <linux/topology.h>
@@ -39,6 +40,7 @@
39#include <linux/memory_hotplug.h> 40#include <linux/memory_hotplug.h>
40#include <linux/nodemask.h> 41#include <linux/nodemask.h>
41#include <linux/vmalloc.h> 42#include <linux/vmalloc.h>
43#include <linux/vmstat.h>
42#include <linux/mempolicy.h> 44#include <linux/mempolicy.h>
43#include <linux/stop_machine.h> 45#include <linux/stop_machine.h>
44#include <linux/sort.h> 46#include <linux/sort.h>
@@ -54,6 +56,7 @@
54#include <trace/events/kmem.h> 56#include <trace/events/kmem.h>
55#include <linux/ftrace_event.h> 57#include <linux/ftrace_event.h>
56#include <linux/memcontrol.h> 58#include <linux/memcontrol.h>
59#include <linux/prefetch.h>
57 60
58#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
59#include <asm/div64.h> 62#include <asm/div64.h>
@@ -1734,6 +1737,45 @@ static inline bool should_suppress_show_mem(void)
1734 return ret; 1737 return ret;
1735} 1738}
1736 1739
1740static DEFINE_RATELIMIT_STATE(nopage_rs,
1741 DEFAULT_RATELIMIT_INTERVAL,
1742 DEFAULT_RATELIMIT_BURST);
1743
1744void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1745{
1746 va_list args;
1747 unsigned int filter = SHOW_MEM_FILTER_NODES;
1748
1749 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
1750 return;
1751
1752 /*
1753 * This documents exceptions given to allocations in certain
1754 * contexts that are allowed to allocate outside current's set
1755 * of allowed nodes.
1756 */
1757 if (!(gfp_mask & __GFP_NOMEMALLOC))
1758 if (test_thread_flag(TIF_MEMDIE) ||
1759 (current->flags & (PF_MEMALLOC | PF_EXITING)))
1760 filter &= ~SHOW_MEM_FILTER_NODES;
1761 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
1762 filter &= ~SHOW_MEM_FILTER_NODES;
1763
1764 if (fmt) {
1765 printk(KERN_WARNING);
1766 va_start(args, fmt);
1767 vprintk(fmt, args);
1768 va_end(args);
1769 }
1770
1771 pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n",
1772 current->comm, order, gfp_mask);
1773
1774 dump_stack();
1775 if (!should_suppress_show_mem())
1776 show_mem(filter);
1777}
1778
1737static inline int 1779static inline int
1738should_alloc_retry(gfp_t gfp_mask, unsigned int order, 1780should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1739 unsigned long pages_reclaimed) 1781 unsigned long pages_reclaimed)
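The new warn_alloc_failed() above centralizes the rate-limited page-allocation-failure report that the nopage: path used to open-code (a later hunk reduces that path to a single call). The optional format string lets a caller prepend its own context line. A hedged sketch of possible call sites; the second line's driver name, message and size variable are illustrative, not taken from this patch:

warn_alloc_failed(gfp_mask, order, NULL);                        /* what the nopage: path does below */
warn_alloc_failed(gfp_mask, 0, "mydrv: %zu bytes failed\n", sz); /* hypothetical caller with context */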
@@ -2064,6 +2106,7 @@ restart:
2064 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2106 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2065 &preferred_zone); 2107 &preferred_zone);
2066 2108
2109rebalance:
2067 /* This is the last chance, in general, before the goto nopage. */ 2110 /* This is the last chance, in general, before the goto nopage. */
2068 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2111 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2069 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2112 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2071,7 +2114,6 @@ restart:
2071 if (page) 2114 if (page)
2072 goto got_pg; 2115 goto got_pg;
2073 2116
2074rebalance:
2075 /* Allocate without watermarks if the context allows */ 2117 /* Allocate without watermarks if the context allows */
2076 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2118 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2077 page = __alloc_pages_high_priority(gfp_mask, order, 2119 page = __alloc_pages_high_priority(gfp_mask, order,
@@ -2105,7 +2147,7 @@ rebalance:
2105 sync_migration); 2147 sync_migration);
2106 if (page) 2148 if (page)
2107 goto got_pg; 2149 goto got_pg;
2108 sync_migration = !(gfp_mask & __GFP_NO_KSWAPD); 2150 sync_migration = true;
2109 2151
2110 /* Try direct reclaim and then allocating */ 2152 /* Try direct reclaim and then allocating */
2111 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2153 page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2176,27 +2218,7 @@ rebalance:
2176 } 2218 }
2177 2219
2178nopage: 2220nopage:
2179 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 2221 warn_alloc_failed(gfp_mask, order, NULL);
2180 unsigned int filter = SHOW_MEM_FILTER_NODES;
2181
2182 /*
2183 * This documents exceptions given to allocations in certain
2184 * contexts that are allowed to allocate outside current's set
2185 * of allowed nodes.
2186 */
2187 if (!(gfp_mask & __GFP_NOMEMALLOC))
2188 if (test_thread_flag(TIF_MEMDIE) ||
2189 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2190 filter &= ~SHOW_MEM_FILTER_NODES;
2191 if (in_interrupt() || !wait)
2192 filter &= ~SHOW_MEM_FILTER_NODES;
2193
2194 pr_warning("%s: page allocation failure. order:%d, mode:0x%x\n",
2195 current->comm, order, gfp_mask);
2196 dump_stack();
2197 if (!should_suppress_show_mem())
2198 show_mem(filter);
2199 }
2200 return page; 2222 return page;
2201got_pg: 2223got_pg:
2202 if (kmemcheck_enabled) 2224 if (kmemcheck_enabled)
@@ -2225,6 +2247,10 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2225 2247
2226 if (should_fail_alloc_page(gfp_mask, order)) 2248 if (should_fail_alloc_page(gfp_mask, order))
2227 return NULL; 2249 return NULL;
2250#ifndef CONFIG_ZONE_DMA
2251 if (WARN_ON_ONCE(gfp_mask & __GFP_DMA))
2252 return NULL;
2253#endif
2228 2254
2229 /* 2255 /*
2230 * Check the zones suitable for the gfp_mask contain at least one 2256 * Check the zones suitable for the gfp_mask contain at least one
@@ -2472,10 +2498,10 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2472#endif 2498#endif
2473 2499
2474/* 2500/*
2475 * Determine whether the zone's node should be displayed or not, depending on 2501 * Determine whether the node should be displayed or not, depending on whether
2476 * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas(). 2502 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
2477 */ 2503 */
2478static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone) 2504bool skip_free_areas_node(unsigned int flags, int nid)
2479{ 2505{
2480 bool ret = false; 2506 bool ret = false;
2481 2507
@@ -2483,8 +2509,7 @@ static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone)
2483 goto out; 2509 goto out;
2484 2510
2485 get_mems_allowed(); 2511 get_mems_allowed();
2486 ret = !node_isset(zone->zone_pgdat->node_id, 2512 ret = !node_isset(nid, cpuset_current_mems_allowed);
2487 cpuset_current_mems_allowed);
2488 put_mems_allowed(); 2513 put_mems_allowed();
2489out: 2514out:
2490 return ret; 2515 return ret;
@@ -2499,13 +2524,13 @@ out:
2499 * Suppresses nodes that are not allowed by current's cpuset if 2524 * Suppresses nodes that are not allowed by current's cpuset if
2500 * SHOW_MEM_FILTER_NODES is passed. 2525 * SHOW_MEM_FILTER_NODES is passed.
2501 */ 2526 */
2502void __show_free_areas(unsigned int filter) 2527void show_free_areas(unsigned int filter)
2503{ 2528{
2504 int cpu; 2529 int cpu;
2505 struct zone *zone; 2530 struct zone *zone;
2506 2531
2507 for_each_populated_zone(zone) { 2532 for_each_populated_zone(zone) {
2508 if (skip_free_areas_zone(filter, zone)) 2533 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2509 continue; 2534 continue;
2510 show_node(zone); 2535 show_node(zone);
2511 printk("%s per-cpu:\n", zone->name); 2536 printk("%s per-cpu:\n", zone->name);
@@ -2548,7 +2573,7 @@ void __show_free_areas(unsigned int filter)
2548 for_each_populated_zone(zone) { 2573 for_each_populated_zone(zone) {
2549 int i; 2574 int i;
2550 2575
2551 if (skip_free_areas_zone(filter, zone)) 2576 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2552 continue; 2577 continue;
2553 show_node(zone); 2578 show_node(zone);
2554 printk("%s" 2579 printk("%s"
@@ -2617,7 +2642,7 @@ void __show_free_areas(unsigned int filter)
2617 for_each_populated_zone(zone) { 2642 for_each_populated_zone(zone) {
2618 unsigned long nr[MAX_ORDER], flags, order, total = 0; 2643 unsigned long nr[MAX_ORDER], flags, order, total = 0;
2619 2644
2620 if (skip_free_areas_zone(filter, zone)) 2645 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2621 continue; 2646 continue;
2622 show_node(zone); 2647 show_node(zone);
2623 printk("%s: ", zone->name); 2648 printk("%s: ", zone->name);
@@ -2638,11 +2663,6 @@ void __show_free_areas(unsigned int filter)
2638 show_swap_cache_info(); 2663 show_swap_cache_info();
2639} 2664}
2640 2665
2641void show_free_areas(void)
2642{
2643 __show_free_areas(0);
2644}
2645
2646static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 2666static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
2647{ 2667{
2648 zoneref->zone = zone; 2668 zoneref->zone = zone;
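With __show_free_areas() folded into show_free_areas(), every caller now passes a filter word explicitly: 0 dumps every populated node (as the nommu error paths above now do), while SHOW_MEM_FILTER_NODES suppresses nodes that current's cpuset disallows. A two-line usage sketch:

show_free_areas(0);                      /* unconditional dump of every populated node */
show_free_areas(SHOW_MEM_FILTER_NODES);  /* skip nodes outside current's cpuset */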
@@ -3313,6 +3333,20 @@ static inline unsigned long wait_table_bits(unsigned long size)
3313#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3333#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3314 3334
3315/* 3335/*
3336 * Check if a pageblock contains reserved pages
3337 */
3338static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3339{
3340 unsigned long pfn;
3341
3342 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3343 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3344 return 1;
3345 }
3346 return 0;
3347}
3348
3349/*
3316 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3350 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3317 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3351 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3318 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3352 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
@@ -3321,7 +3355,7 @@ static inline unsigned long wait_table_bits(unsigned long size)
3321 */ 3355 */
3322static void setup_zone_migrate_reserve(struct zone *zone) 3356static void setup_zone_migrate_reserve(struct zone *zone)
3323{ 3357{
3324 unsigned long start_pfn, pfn, end_pfn; 3358 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3325 struct page *page; 3359 struct page *page;
3326 unsigned long block_migratetype; 3360 unsigned long block_migratetype;
3327 int reserve; 3361 int reserve;
@@ -3351,7 +3385,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3351 continue; 3385 continue;
3352 3386
3353 /* Blocks with reserved pages will never free, skip them. */ 3387 /* Blocks with reserved pages will never free, skip them. */
3354 if (PageReserved(page)) 3388 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3389 if (pageblock_is_reserved(pfn, block_end_pfn))
3355 continue; 3390 continue;
3356 3391
3357 block_migratetype = get_pageblock_migratetype(page); 3392 block_migratetype = get_pageblock_migratetype(page);
@@ -3540,7 +3575,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3540 pcp->batch = PAGE_SHIFT * 8; 3575 pcp->batch = PAGE_SHIFT * 8;
3541} 3576}
3542 3577
3543static __meminit void setup_zone_pageset(struct zone *zone) 3578static void setup_zone_pageset(struct zone *zone)
3544{ 3579{
3545 int cpu; 3580 int cpu;
3546 3581
@@ -4288,10 +4323,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4288 zone->zone_pgdat = pgdat; 4323 zone->zone_pgdat = pgdat;
4289 4324
4290 zone_pcp_init(zone); 4325 zone_pcp_init(zone);
4291 for_each_lru(l) { 4326 for_each_lru(l)
4292 INIT_LIST_HEAD(&zone->lru[l].list); 4327 INIT_LIST_HEAD(&zone->lru[l].list);
4293 zone->reclaim_stat.nr_saved_scan[l] = 0;
4294 }
4295 zone->reclaim_stat.recent_rotated[0] = 0; 4328 zone->reclaim_stat.recent_rotated[0] = 0;
4296 zone->reclaim_stat.recent_rotated[1] = 0; 4329 zone->reclaim_stat.recent_rotated[1] = 0;
4297 zone->reclaim_stat.recent_scanned[0] = 0; 4330 zone->reclaim_stat.recent_scanned[0] = 0;
@@ -5099,7 +5132,7 @@ void setup_per_zone_wmarks(void)
5099 * 1TB 101 10GB 5132 * 1TB 101 10GB
5100 * 10TB 320 32GB 5133 * 10TB 320 32GB
5101 */ 5134 */
5102void calculate_zone_inactive_ratio(struct zone *zone) 5135static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5103{ 5136{
5104 unsigned int gb, ratio; 5137 unsigned int gb, ratio;
5105 5138
@@ -5113,7 +5146,7 @@ void calculate_zone_inactive_ratio(struct zone *zone)
5113 zone->inactive_ratio = ratio; 5146 zone->inactive_ratio = ratio;
5114} 5147}
5115 5148
5116static void __init setup_per_zone_inactive_ratio(void) 5149static void __meminit setup_per_zone_inactive_ratio(void)
5117{ 5150{
5118 struct zone *zone; 5151 struct zone *zone;
5119 5152
@@ -5145,7 +5178,7 @@ static void __init setup_per_zone_inactive_ratio(void)
5145 * 8192MB: 11584k 5178 * 8192MB: 11584k
5146 * 16384MB: 16384k 5179 * 16384MB: 16384k
5147 */ 5180 */
5148static int __init init_per_zone_wmark_min(void) 5181int __meminit init_per_zone_wmark_min(void)
5149{ 5182{
5150 unsigned long lowmem_kbytes; 5183 unsigned long lowmem_kbytes;
5151 5184
@@ -5157,6 +5190,7 @@ static int __init init_per_zone_wmark_min(void)
5157 if (min_free_kbytes > 65536) 5190 if (min_free_kbytes > 65536)
5158 min_free_kbytes = 65536; 5191 min_free_kbytes = 65536;
5159 setup_per_zone_wmarks(); 5192 setup_per_zone_wmarks();
5193 refresh_zone_stat_thresholds();
5160 setup_per_zone_lowmem_reserve(); 5194 setup_per_zone_lowmem_reserve();
5161 setup_per_zone_inactive_ratio(); 5195 setup_per_zone_inactive_ratio();
5162 return 0; 5196 return 0;
@@ -5507,10 +5541,8 @@ int set_migratetype_isolate(struct page *page)
5507 struct memory_isolate_notify arg; 5541 struct memory_isolate_notify arg;
5508 int notifier_ret; 5542 int notifier_ret;
5509 int ret = -EBUSY; 5543 int ret = -EBUSY;
5510 int zone_idx;
5511 5544
5512 zone = page_zone(page); 5545 zone = page_zone(page);
5513 zone_idx = zone_idx(zone);
5514 5546
5515 spin_lock_irqsave(&zone->lock, flags); 5547 spin_lock_irqsave(&zone->lock, flags);
5516 5548
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 2daadc322ba6..74ccff61d1be 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -130,7 +130,7 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc)
130 return page; 130 return page;
131} 131}
132 132
133static void *__init_refok alloc_page_cgroup(size_t size, int nid) 133static void *__meminit alloc_page_cgroup(size_t size, int nid)
134{ 134{
135 void *addr = NULL; 135 void *addr = NULL;
136 136
@@ -162,7 +162,7 @@ static void free_page_cgroup(void *addr)
162} 162}
163#endif 163#endif
164 164
165static int __init_refok init_section_page_cgroup(unsigned long pfn) 165static int __meminit init_section_page_cgroup(unsigned long pfn)
166{ 166{
167 struct page_cgroup *base, *pc; 167 struct page_cgroup *base, *pc;
168 struct mem_section *section; 168 struct mem_section *section;
@@ -475,7 +475,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
475 if (!do_swap_account) 475 if (!do_swap_account)
476 return 0; 476 return 0;
477 477
478 length = ((max_pages/SC_PER_PAGE) + 1); 478 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
479 array_size = length * sizeof(void *); 479 array_size = length * sizeof(void *);
480 480
481 array = vmalloc(array_size); 481 array = vmalloc(array_size);
@@ -492,8 +492,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
492 /* memory shortage */ 492 /* memory shortage */
493 ctrl->map = NULL; 493 ctrl->map = NULL;
494 ctrl->length = 0; 494 ctrl->length = 0;
495 vfree(array);
496 mutex_unlock(&swap_cgroup_mutex); 495 mutex_unlock(&swap_cgroup_mutex);
496 vfree(array);
497 goto nomem; 497 goto nomem;
498 } 498 }
499 mutex_unlock(&swap_cgroup_mutex); 499 mutex_unlock(&swap_cgroup_mutex);
@@ -508,7 +508,8 @@ nomem:
508 508
509void swap_cgroup_swapoff(int type) 509void swap_cgroup_swapoff(int type)
510{ 510{
511 int i; 511 struct page **map;
512 unsigned long i, length;
512 struct swap_cgroup_ctrl *ctrl; 513 struct swap_cgroup_ctrl *ctrl;
513 514
514 if (!do_swap_account) 515 if (!do_swap_account)
@@ -516,17 +517,20 @@ void swap_cgroup_swapoff(int type)
516 517
517 mutex_lock(&swap_cgroup_mutex); 518 mutex_lock(&swap_cgroup_mutex);
518 ctrl = &swap_cgroup_ctrl[type]; 519 ctrl = &swap_cgroup_ctrl[type];
519 if (ctrl->map) { 520 map = ctrl->map;
520 for (i = 0; i < ctrl->length; i++) { 521 length = ctrl->length;
521 struct page *page = ctrl->map[i]; 522 ctrl->map = NULL;
523 ctrl->length = 0;
524 mutex_unlock(&swap_cgroup_mutex);
525
526 if (map) {
527 for (i = 0; i < length; i++) {
528 struct page *page = map[i];
522 if (page) 529 if (page)
523 __free_page(page); 530 __free_page(page);
524 } 531 }
525 vfree(ctrl->map); 532 vfree(map);
526 ctrl->map = NULL;
527 ctrl->length = 0;
528 } 533 }
529 mutex_unlock(&swap_cgroup_mutex);
530} 534}
531 535
532#endif 536#endif
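The swap_cgroup_swapoff() hunk above switches to the common "detach under the lock, free after dropping it" shape, so vfree() and __free_page() no longer run with swap_cgroup_mutex held. A reduced, standalone sketch of that shape (plain C with pthreads; the types and names are invented for illustration):

#include <pthread.h>
#include <stdlib.h>

struct ctrl {
        pthread_mutex_t lock;
        void **map;
        unsigned long length;
};

/* Snapshot the pointers under the lock, clear the structure, then do the
 * potentially slow freeing with the lock already dropped. */
static void teardown(struct ctrl *c)
{
        void **map;
        unsigned long i, length;

        pthread_mutex_lock(&c->lock);
        map = c->map;
        length = c->length;
        c->map = NULL;
        c->length = 0;
        pthread_mutex_unlock(&c->lock);

        if (map) {
                for (i = 0; i < length; i++)
                        free(map[i]);
                free(map);
        }
}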
diff --git a/mm/percpu.c b/mm/percpu.c
index a160db39b810..bf80e55dbed7 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1215,8 +1215,10 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1215 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); 1215 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
1216#ifdef CONFIG_SMP 1216#ifdef CONFIG_SMP
1217 PCPU_SETUP_BUG_ON(!ai->static_size); 1217 PCPU_SETUP_BUG_ON(!ai->static_size);
1218 PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
1218#endif 1219#endif
1219 PCPU_SETUP_BUG_ON(!base_addr); 1220 PCPU_SETUP_BUG_ON(!base_addr);
1221 PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
1220 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); 1222 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1221 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); 1223 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
1222 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); 1224 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
@@ -1645,8 +1647,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1645 /* warn if maximum distance is further than 75% of vmalloc space */ 1647 /* warn if maximum distance is further than 75% of vmalloc space */
1646 if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { 1648 if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
1647 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " 1649 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
1648 "space 0x%lx\n", 1650 "space 0x%lx\n", max_distance,
1649 max_distance, VMALLOC_END - VMALLOC_START); 1651 (unsigned long)(VMALLOC_END - VMALLOC_START));
1650#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 1652#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1651 /* and fail if we have fallback */ 1653 /* and fail if we have fallback */
1652 rc = -EINVAL; 1654 rc = -EINVAL;
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
index 603ae98d9694..799dcfd7cd8c 100644
--- a/mm/prio_tree.c
+++ b/mm/prio_tree.c
@@ -13,6 +13,7 @@
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/prio_tree.h> 15#include <linux/prio_tree.h>
16#include <linux/prefetch.h>
16 17
17/* 18/*
18 * See lib/prio_tree.c for details on the general radix priority search tree 19 * See lib/prio_tree.c for details on the general radix priority search tree
diff --git a/mm/readahead.c b/mm/readahead.c
index 2c0cc489e288..867f9dd82dcd 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -180,7 +180,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
180 if (page) 180 if (page)
181 continue; 181 continue;
182 182
183 page = page_cache_alloc_cold(mapping); 183 page = page_cache_alloc_readahead(mapping);
184 if (!page) 184 if (!page)
185 break; 185 break;
186 page->index = page_offset; 186 page->index = page_offset;
diff --git a/mm/rmap.c b/mm/rmap.c
index 8da044a1db0f..3a39b518a653 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,8 +24,8 @@
24 * inode->i_alloc_sem (vmtruncate_range) 24 * inode->i_alloc_sem (vmtruncate_range)
25 * mm->mmap_sem 25 * mm->mmap_sem
26 * page->flags PG_locked (lock_page) 26 * page->flags PG_locked (lock_page)
27 * mapping->i_mmap_lock 27 * mapping->i_mmap_mutex
28 * anon_vma->lock 28 * anon_vma->mutex
29 * mm->page_table_lock or pte_lock 29 * mm->page_table_lock or pte_lock
30 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 30 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
31 * swap_lock (in swap_duplicate, swap_info_get) 31 * swap_lock (in swap_duplicate, swap_info_get)
@@ -40,7 +40,7 @@
40 * 40 *
41 * (code doesn't rely on that order so it could be switched around) 41 * (code doesn't rely on that order so it could be switched around)
42 * ->tasklist_lock 42 * ->tasklist_lock
43 * anon_vma->lock (memory_failure, collect_procs_anon) 43 * anon_vma->mutex (memory_failure, collect_procs_anon)
44 * pte map lock 44 * pte map lock
45 */ 45 */
46 46
@@ -86,6 +86,29 @@ static inline struct anon_vma *anon_vma_alloc(void)
86static inline void anon_vma_free(struct anon_vma *anon_vma) 86static inline void anon_vma_free(struct anon_vma *anon_vma)
87{ 87{
88 VM_BUG_ON(atomic_read(&anon_vma->refcount)); 88 VM_BUG_ON(atomic_read(&anon_vma->refcount));
89
90 /*
91 * Synchronize against page_lock_anon_vma() such that
92 * we can safely hold the lock without the anon_vma getting
93 * freed.
94 *
95 * Relies on the full mb implied by the atomic_dec_and_test() from
96 * put_anon_vma() against the acquire barrier implied by
97 * mutex_trylock() from page_lock_anon_vma(). This orders:
98 *
99 * page_lock_anon_vma() VS put_anon_vma()
100 * mutex_trylock() atomic_dec_and_test()
101 * LOCK MB
102 * atomic_read() mutex_is_locked()
103 *
104 * LOCK should suffice since the actual taking of the lock must
105 * happen _before_ what follows.
106 */
107 if (mutex_is_locked(&anon_vma->root->mutex)) {
108 anon_vma_lock(anon_vma);
109 anon_vma_unlock(anon_vma);
110 }
111
89 kmem_cache_free(anon_vma_cachep, anon_vma); 112 kmem_cache_free(anon_vma_cachep, anon_vma);
90} 113}
91 114
@@ -307,7 +330,7 @@ static void anon_vma_ctor(void *data)
307{ 330{
308 struct anon_vma *anon_vma = data; 331 struct anon_vma *anon_vma = data;
309 332
310 spin_lock_init(&anon_vma->lock); 333 mutex_init(&anon_vma->mutex);
311 atomic_set(&anon_vma->refcount, 0); 334 atomic_set(&anon_vma->refcount, 0);
312 INIT_LIST_HEAD(&anon_vma->head); 335 INIT_LIST_HEAD(&anon_vma->head);
313} 336}
@@ -320,12 +343,26 @@ void __init anon_vma_init(void)
320} 343}
321 344
322/* 345/*
323 * Getting a lock on a stable anon_vma from a page off the LRU is 346 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
324 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 347 *
348 * Since there is no serialization whatsoever against page_remove_rmap()
349 * the best this function can do is return a locked anon_vma that might
350 * have been relevant to this page.
351 *
352 * The page might have been remapped to a different anon_vma or the anon_vma
353 * returned may already be freed (and even reused).
354 *
355 * All users of this function must be very careful when walking the anon_vma
356 * chain and verify that the page in question is indeed mapped in it
357 * [ something equivalent to page_mapped_in_vma() ].
358 *
359 * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
360 * that the anon_vma pointer from page->mapping is valid if there is a
361 * mapcount, we can dereference the anon_vma after observing those.
325 */ 362 */
326struct anon_vma *__page_lock_anon_vma(struct page *page) 363struct anon_vma *page_get_anon_vma(struct page *page)
327{ 364{
328 struct anon_vma *anon_vma, *root_anon_vma; 365 struct anon_vma *anon_vma = NULL;
329 unsigned long anon_mapping; 366 unsigned long anon_mapping;
330 367
331 rcu_read_lock(); 368 rcu_read_lock();
@@ -336,32 +373,97 @@ struct anon_vma *__page_lock_anon_vma(struct page *page)
336 goto out; 373 goto out;
337 374
338 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 375 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
339 root_anon_vma = ACCESS_ONCE(anon_vma->root); 376 if (!atomic_inc_not_zero(&anon_vma->refcount)) {
340 spin_lock(&root_anon_vma->lock); 377 anon_vma = NULL;
378 goto out;
379 }
341 380
342 /* 381 /*
343 * If this page is still mapped, then its anon_vma cannot have been 382 * If this page is still mapped, then its anon_vma cannot have been
344 * freed. But if it has been unmapped, we have no security against 383 * freed. But if it has been unmapped, we have no security against the
345 * the anon_vma structure being freed and reused (for another anon_vma: 384 * anon_vma structure being freed and reused (for another anon_vma:
346 * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot 385 * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero()
347 * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting 386 * above cannot corrupt).
348 * anon_vma->root before page_unlock_anon_vma() is called to unlock.
349 */ 387 */
350 if (page_mapped(page)) 388 if (!page_mapped(page)) {
351 return anon_vma; 389 put_anon_vma(anon_vma);
390 anon_vma = NULL;
391 }
392out:
393 rcu_read_unlock();
394
395 return anon_vma;
396}
397
398/*
399 * Similar to page_get_anon_vma() except it locks the anon_vma.
400 *
401 * It's a little more complex as it tries to keep the fast path to a single
402 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
403 * reference like with page_get_anon_vma() and then block on the mutex.
404 */
405struct anon_vma *page_lock_anon_vma(struct page *page)
406{
407 struct anon_vma *anon_vma = NULL;
408 unsigned long anon_mapping;
409
410 rcu_read_lock();
411 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
412 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
413 goto out;
414 if (!page_mapped(page))
415 goto out;
416
417 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
418 if (mutex_trylock(&anon_vma->root->mutex)) {
419 /*
420 * If we observe a !0 refcount, then holding the lock ensures
421 * the anon_vma will not go away, see __put_anon_vma().
422 */
423 if (!atomic_read(&anon_vma->refcount)) {
424 anon_vma_unlock(anon_vma);
425 anon_vma = NULL;
426 }
427 goto out;
428 }
429
430 /* trylock failed, we got to sleep */
431 if (!atomic_inc_not_zero(&anon_vma->refcount)) {
432 anon_vma = NULL;
433 goto out;
434 }
435
436 if (!page_mapped(page)) {
437 put_anon_vma(anon_vma);
438 anon_vma = NULL;
439 goto out;
440 }
441
442 /* we pinned the anon_vma, its safe to sleep */
443 rcu_read_unlock();
444 anon_vma_lock(anon_vma);
445
446 if (atomic_dec_and_test(&anon_vma->refcount)) {
447 /*
448 * Oops, we held the last refcount, release the lock
449 * and bail -- can't simply use put_anon_vma() because
450 * we'll deadlock on the anon_vma_lock() recursion.
451 */
452 anon_vma_unlock(anon_vma);
453 __put_anon_vma(anon_vma);
454 anon_vma = NULL;
455 }
456
457 return anon_vma;
352 458
353 spin_unlock(&root_anon_vma->lock);
354out: 459out:
355 rcu_read_unlock(); 460 rcu_read_unlock();
356 return NULL; 461 return anon_vma;
357} 462}
358 463
359void page_unlock_anon_vma(struct anon_vma *anon_vma) 464void page_unlock_anon_vma(struct anon_vma *anon_vma)
360 __releases(&anon_vma->root->lock)
361 __releases(RCU)
362{ 465{
363 anon_vma_unlock(anon_vma); 466 anon_vma_unlock(anon_vma);
364 rcu_read_unlock();
365} 467}
366 468
367/* 469/*
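The rmap hunks above replace the RCU-based __page_lock_anon_vma() with page_get_anon_vma() plus a page_lock_anon_vma() that returns holding only the anon_vma mutex (or NULL), so page_unlock_anon_vma() no longer drops rcu_read_lock(). The caller pairing is unchanged; a minimal sketch of it, modelled on the anon rmap walkers (the function name is invented and the per-VMA walk body is elided):

/* Sketch of the lock/unlock pairing as used by the anon rmap walkers. */
static int walk_anon_mappings(struct page *page)
{
        struct anon_vma *anon_vma;

        anon_vma = page_lock_anon_vma(page);    /* may return NULL */
        if (!anon_vma)
                return 0;

        /* ... walk the anon_vma's VMA list, re-checking that the page is
         *     actually mapped in each VMA before touching it ... */

        page_unlock_anon_vma(anon_vma);
        return 1;
}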
@@ -646,14 +748,14 @@ static int page_referenced_file(struct page *page,
646 * The page lock not only makes sure that page->mapping cannot 748 * The page lock not only makes sure that page->mapping cannot
647 * suddenly be NULLified by truncation, it makes sure that the 749 * suddenly be NULLified by truncation, it makes sure that the
648 * structure at mapping cannot be freed and reused yet, 750 * structure at mapping cannot be freed and reused yet,
649 * so we can safely take mapping->i_mmap_lock. 751 * so we can safely take mapping->i_mmap_mutex.
650 */ 752 */
651 BUG_ON(!PageLocked(page)); 753 BUG_ON(!PageLocked(page));
652 754
653 spin_lock(&mapping->i_mmap_lock); 755 mutex_lock(&mapping->i_mmap_mutex);
654 756
655 /* 757 /*
656 * i_mmap_lock does not stabilize mapcount at all, but mapcount 758 * i_mmap_mutex does not stabilize mapcount at all, but mapcount
657 * is more likely to be accurate if we note it after spinning. 759 * is more likely to be accurate if we note it after spinning.
658 */ 760 */
659 mapcount = page_mapcount(page); 761 mapcount = page_mapcount(page);
@@ -675,7 +777,7 @@ static int page_referenced_file(struct page *page,
675 break; 777 break;
676 } 778 }
677 779
678 spin_unlock(&mapping->i_mmap_lock); 780 mutex_unlock(&mapping->i_mmap_mutex);
679 return referenced; 781 return referenced;
680} 782}
681 783
@@ -719,7 +821,7 @@ int page_referenced(struct page *page,
719 unlock_page(page); 821 unlock_page(page);
720 } 822 }
721out: 823out:
722 if (page_test_and_clear_young(page)) 824 if (page_test_and_clear_young(page_to_pfn(page)))
723 referenced++; 825 referenced++;
724 826
725 return referenced; 827 return referenced;
@@ -762,7 +864,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
762 864
763 BUG_ON(PageAnon(page)); 865 BUG_ON(PageAnon(page));
764 866
765 spin_lock(&mapping->i_mmap_lock); 867 mutex_lock(&mapping->i_mmap_mutex);
766 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 868 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
767 if (vma->vm_flags & VM_SHARED) { 869 if (vma->vm_flags & VM_SHARED) {
768 unsigned long address = vma_address(page, vma); 870 unsigned long address = vma_address(page, vma);
@@ -771,7 +873,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
771 ret += page_mkclean_one(page, vma, address); 873 ret += page_mkclean_one(page, vma, address);
772 } 874 }
773 } 875 }
774 spin_unlock(&mapping->i_mmap_lock); 876 mutex_unlock(&mapping->i_mmap_mutex);
775 return ret; 877 return ret;
776} 878}
777 879
@@ -785,10 +887,8 @@ int page_mkclean(struct page *page)
785 struct address_space *mapping = page_mapping(page); 887 struct address_space *mapping = page_mapping(page);
786 if (mapping) { 888 if (mapping) {
787 ret = page_mkclean_file(mapping, page); 889 ret = page_mkclean_file(mapping, page);
788 if (page_test_dirty(page)) { 890 if (page_test_and_clear_dirty(page_to_pfn(page), 1))
789 page_clear_dirty(page, 1);
790 ret = 1; 891 ret = 1;
791 }
792 } 892 }
793 } 893 }
794 894
@@ -981,10 +1081,9 @@ void page_remove_rmap(struct page *page)
981 * not if it's in swapcache - there might be another pte slot 1081 * not if it's in swapcache - there might be another pte slot
982 * containing the swap entry, but page not yet written to swap. 1082 * containing the swap entry, but page not yet written to swap.
983 */ 1083 */
984 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { 1084 if ((!PageAnon(page) || PageSwapCache(page)) &&
985 page_clear_dirty(page, 1); 1085 page_test_and_clear_dirty(page_to_pfn(page), 1))
986 set_page_dirty(page); 1086 set_page_dirty(page);
987 }
988 /* 1087 /*
989 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED 1088 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
990 * and not charged by memcg for now. 1089 * and not charged by memcg for now.
@@ -1122,7 +1221,7 @@ out_mlock:
1122 /* 1221 /*
1123 * We need mmap_sem locking, Otherwise VM_LOCKED check makes 1222 * We need mmap_sem locking, Otherwise VM_LOCKED check makes
1124 * unstable result and race. Plus, We can't wait here because 1223 * unstable result and race. Plus, We can't wait here because
1125 * we now hold anon_vma->lock or mapping->i_mmap_lock. 1224 * we now hold anon_vma->mutex or mapping->i_mmap_mutex.
1126 * if trylock failed, the page remain in evictable lru and later 1225 * if trylock failed, the page remain in evictable lru and later
1127 * vmscan could retry to move the page to unevictable lru if the 1226 * vmscan could retry to move the page to unevictable lru if the
1128 * page is actually mlocked. 1227 * page is actually mlocked.
@@ -1348,7 +1447,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1348 unsigned long max_nl_size = 0; 1447 unsigned long max_nl_size = 0;
1349 unsigned int mapcount; 1448 unsigned int mapcount;
1350 1449
1351 spin_lock(&mapping->i_mmap_lock); 1450 mutex_lock(&mapping->i_mmap_mutex);
1352 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1451 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1353 unsigned long address = vma_address(page, vma); 1452 unsigned long address = vma_address(page, vma);
1354 if (address == -EFAULT) 1453 if (address == -EFAULT)
@@ -1394,7 +1493,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1394 mapcount = page_mapcount(page); 1493 mapcount = page_mapcount(page);
1395 if (!mapcount) 1494 if (!mapcount)
1396 goto out; 1495 goto out;
1397 cond_resched_lock(&mapping->i_mmap_lock); 1496 cond_resched();
1398 1497
1399 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; 1498 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
1400 if (max_nl_cursor == 0) 1499 if (max_nl_cursor == 0)
@@ -1416,7 +1515,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1416 } 1515 }
1417 vma->vm_private_data = (void *) max_nl_cursor; 1516 vma->vm_private_data = (void *) max_nl_cursor;
1418 } 1517 }
1419 cond_resched_lock(&mapping->i_mmap_lock); 1518 cond_resched();
1420 max_nl_cursor += CLUSTER_SIZE; 1519 max_nl_cursor += CLUSTER_SIZE;
1421 } while (max_nl_cursor <= max_nl_size); 1520 } while (max_nl_cursor <= max_nl_size);
1422 1521
@@ -1428,7 +1527,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1428 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 1527 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
1429 vma->vm_private_data = NULL; 1528 vma->vm_private_data = NULL;
1430out: 1529out:
1431 spin_unlock(&mapping->i_mmap_lock); 1530 mutex_unlock(&mapping->i_mmap_mutex);
1432 return ret; 1531 return ret;
1433} 1532}
1434 1533
@@ -1547,7 +1646,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
1547 1646
1548 if (!mapping) 1647 if (!mapping)
1549 return ret; 1648 return ret;
1550 spin_lock(&mapping->i_mmap_lock); 1649 mutex_lock(&mapping->i_mmap_mutex);
1551 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1650 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1552 unsigned long address = vma_address(page, vma); 1651 unsigned long address = vma_address(page, vma);
1553 if (address == -EFAULT) 1652 if (address == -EFAULT)
@@ -1561,7 +1660,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
1561 * never contain migration ptes. Decide what to do about this 1660 * never contain migration ptes. Decide what to do about this
1562 * limitation to linear when we need rmap_walk() on nonlinear. 1661 * limitation to linear when we need rmap_walk() on nonlinear.
1563 */ 1662 */
1564 spin_unlock(&mapping->i_mmap_lock); 1663 mutex_unlock(&mapping->i_mmap_mutex);
1565 return ret; 1664 return ret;
1566} 1665}
1567 1666
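
The rmap hunks above are part of converting mapping->i_mmap_lock from a spinlock into i_mmap_mutex, which is why the cond_resched_lock() calls in try_to_unmap_file() become plain cond_resched(): the lock is now sleepable, so walkers no longer have to drop and retake it to yield. Below is a minimal sketch of the resulting walk pattern, using only types and helpers that appear in this diff except for walk_one_vma(), which is a hypothetical stand-in for whatever per-VMA work a caller does; it is an illustration, not kernel code.

    /* Sketch only: the i_mmap_mutex walk pattern after this series. */
    static int walk_file_vmas(struct address_space *mapping, struct page *page)
    {
        struct vm_area_struct *vma;
        struct prio_tree_iter iter;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        int ret = 0;

        mutex_lock(&mapping->i_mmap_mutex);     /* may sleep; no longer a spinlock */
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
            ret += walk_one_vma(page, vma);     /* hypothetical per-VMA work */
            cond_resched();                     /* fine under a mutex; was cond_resched_lock() */
        }
        mutex_unlock(&mapping->i_mmap_mutex);
        return ret;
    }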
diff --git a/mm/shmem.c b/mm/shmem.c
index dfc7069102ee..1acfb2687bfa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -99,6 +99,13 @@ static struct vfsmount *shm_mnt;
99/* Pretend that each entry is of this size in directory's i_size */ 99/* Pretend that each entry is of this size in directory's i_size */
100#define BOGO_DIRENT_SIZE 20 100#define BOGO_DIRENT_SIZE 20
101 101
102struct shmem_xattr {
103 struct list_head list; /* anchored by shmem_inode_info->xattr_list */
104 char *name; /* xattr name */
105 size_t size;
106 char value[0];
107};
108
102/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ 109/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
103enum sgp_type { 110enum sgp_type {
104 SGP_READ, /* don't exceed i_size, don't allocate page */ 111 SGP_READ, /* don't exceed i_size, don't allocate page */
@@ -822,6 +829,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
822static void shmem_evict_inode(struct inode *inode) 829static void shmem_evict_inode(struct inode *inode)
823{ 830{
824 struct shmem_inode_info *info = SHMEM_I(inode); 831 struct shmem_inode_info *info = SHMEM_I(inode);
832 struct shmem_xattr *xattr, *nxattr;
825 833
826 if (inode->i_mapping->a_ops == &shmem_aops) { 834 if (inode->i_mapping->a_ops == &shmem_aops) {
827 truncate_inode_pages(inode->i_mapping, 0); 835 truncate_inode_pages(inode->i_mapping, 0);
@@ -834,6 +842,11 @@ static void shmem_evict_inode(struct inode *inode)
834 mutex_unlock(&shmem_swaplist_mutex); 842 mutex_unlock(&shmem_swaplist_mutex);
835 } 843 }
836 } 844 }
845
846 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
847 kfree(xattr->name);
848 kfree(xattr);
849 }
837 BUG_ON(inode->i_blocks); 850 BUG_ON(inode->i_blocks);
838 shmem_free_inode(inode->i_sb); 851 shmem_free_inode(inode->i_sb);
839 end_writeback(inode); 852 end_writeback(inode);
@@ -916,11 +929,12 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
916 if (size > ENTRIES_PER_PAGE) 929 if (size > ENTRIES_PER_PAGE)
917 size = ENTRIES_PER_PAGE; 930 size = ENTRIES_PER_PAGE;
918 offset = shmem_find_swp(entry, ptr, ptr+size); 931 offset = shmem_find_swp(entry, ptr, ptr+size);
932 shmem_swp_unmap(ptr);
919 if (offset >= 0) { 933 if (offset >= 0) {
920 shmem_dir_unmap(dir); 934 shmem_dir_unmap(dir);
935 ptr = shmem_swp_map(subdir);
921 goto found; 936 goto found;
922 } 937 }
923 shmem_swp_unmap(ptr);
924 } 938 }
925 } 939 }
926lost1: 940lost1:
@@ -1291,12 +1305,10 @@ repeat:
1291 swappage = lookup_swap_cache(swap); 1305 swappage = lookup_swap_cache(swap);
1292 if (!swappage) { 1306 if (!swappage) {
1293 shmem_swp_unmap(entry); 1307 shmem_swp_unmap(entry);
1308 spin_unlock(&info->lock);
1294 /* here we actually do the io */ 1309 /* here we actually do the io */
1295 if (type && !(*type & VM_FAULT_MAJOR)) { 1310 if (type)
1296 __count_vm_event(PGMAJFAULT);
1297 *type |= VM_FAULT_MAJOR; 1311 *type |= VM_FAULT_MAJOR;
1298 }
1299 spin_unlock(&info->lock);
1300 swappage = shmem_swapin(swap, gfp, info, idx); 1312 swappage = shmem_swapin(swap, gfp, info, idx);
1301 if (!swappage) { 1313 if (!swappage) {
1302 spin_lock(&info->lock); 1314 spin_lock(&info->lock);
@@ -1535,7 +1547,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1535 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1547 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1536 if (error) 1548 if (error)
1537 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1549 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1538 1550 if (ret & VM_FAULT_MAJOR) {
1551 count_vm_event(PGMAJFAULT);
1552 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1553 }
1539 return ret | VM_FAULT_LOCKED; 1554 return ret | VM_FAULT_LOCKED;
1540} 1555}
1541 1556
@@ -1614,6 +1629,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1614 spin_lock_init(&info->lock); 1629 spin_lock_init(&info->lock);
1615 info->flags = flags & VM_NORESERVE; 1630 info->flags = flags & VM_NORESERVE;
1616 INIT_LIST_HEAD(&info->swaplist); 1631 INIT_LIST_HEAD(&info->swaplist);
1632 INIT_LIST_HEAD(&info->xattr_list);
1617 cache_no_acl(inode); 1633 cache_no_acl(inode);
1618 1634
1619 switch (mode & S_IFMT) { 1635 switch (mode & S_IFMT) {
@@ -2013,9 +2029,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2013 2029
2014 info = SHMEM_I(inode); 2030 info = SHMEM_I(inode);
2015 inode->i_size = len-1; 2031 inode->i_size = len-1;
2016 if (len <= (char *)inode - (char *)info) { 2032 if (len <= SHMEM_SYMLINK_INLINE_LEN) {
2017 /* do it inline */ 2033 /* do it inline */
2018 memcpy(info, symname, len); 2034 memcpy(info->inline_symlink, symname, len);
2019 inode->i_op = &shmem_symlink_inline_operations; 2035 inode->i_op = &shmem_symlink_inline_operations;
2020 } else { 2036 } else {
2021 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 2037 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
@@ -2041,7 +2057,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2041 2057
2042static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) 2058static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
2043{ 2059{
2044 nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode)); 2060 nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink);
2045 return NULL; 2061 return NULL;
2046} 2062}
2047 2063
@@ -2065,63 +2081,253 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
2065 } 2081 }
2066} 2082}
2067 2083
2068static const struct inode_operations shmem_symlink_inline_operations = { 2084#ifdef CONFIG_TMPFS_XATTR
2069 .readlink = generic_readlink,
2070 .follow_link = shmem_follow_link_inline,
2071};
2072
2073static const struct inode_operations shmem_symlink_inode_operations = {
2074 .readlink = generic_readlink,
2075 .follow_link = shmem_follow_link,
2076 .put_link = shmem_put_link,
2077};
2078
2079#ifdef CONFIG_TMPFS_POSIX_ACL
2080/* 2085/*
2081 * Superblocks without xattr inode operations will get security.* xattr 2086 * Superblocks without xattr inode operations may get some security.* xattr
2082 * support from the VFS "for free". As soon as we have any other xattrs 2087 * support from the LSM "for free". As soon as we have any other xattrs
2083 * like ACLs, we also need to implement the security.* handlers at 2088 * like ACLs, we also need to implement the security.* handlers at
2084 * filesystem level, though. 2089 * filesystem level, though.
2085 */ 2090 */
2086 2091
2087static size_t shmem_xattr_security_list(struct dentry *dentry, char *list, 2092static int shmem_xattr_get(struct dentry *dentry, const char *name,
2088 size_t list_len, const char *name, 2093 void *buffer, size_t size)
2089 size_t name_len, int handler_flags)
2090{ 2094{
2091 return security_inode_listsecurity(dentry->d_inode, list, list_len); 2095 struct shmem_inode_info *info;
2092} 2096 struct shmem_xattr *xattr;
2097 int ret = -ENODATA;
2093 2098
2094static int shmem_xattr_security_get(struct dentry *dentry, const char *name, 2099 info = SHMEM_I(dentry->d_inode);
2095 void *buffer, size_t size, int handler_flags) 2100
2096{ 2101 spin_lock(&info->lock);
2097 if (strcmp(name, "") == 0) 2102 list_for_each_entry(xattr, &info->xattr_list, list) {
2098 return -EINVAL; 2103 if (strcmp(name, xattr->name))
2099 return xattr_getsecurity(dentry->d_inode, name, buffer, size); 2104 continue;
2105
2106 ret = xattr->size;
2107 if (buffer) {
2108 if (size < xattr->size)
2109 ret = -ERANGE;
2110 else
2111 memcpy(buffer, xattr->value, xattr->size);
2112 }
2113 break;
2114 }
2115 spin_unlock(&info->lock);
2116 return ret;
2100} 2117}
2101 2118
2102static int shmem_xattr_security_set(struct dentry *dentry, const char *name, 2119static int shmem_xattr_set(struct dentry *dentry, const char *name,
2103 const void *value, size_t size, int flags, int handler_flags) 2120 const void *value, size_t size, int flags)
2104{ 2121{
2105 if (strcmp(name, "") == 0) 2122 struct inode *inode = dentry->d_inode;
2106 return -EINVAL; 2123 struct shmem_inode_info *info = SHMEM_I(inode);
2107 return security_inode_setsecurity(dentry->d_inode, name, value, 2124 struct shmem_xattr *xattr;
2108 size, flags); 2125 struct shmem_xattr *new_xattr = NULL;
2126 size_t len;
2127 int err = 0;
2128
2129 /* value == NULL means remove */
2130 if (value) {
2131 /* wrap around? */
2132 len = sizeof(*new_xattr) + size;
2133 if (len <= sizeof(*new_xattr))
2134 return -ENOMEM;
2135
2136 new_xattr = kmalloc(len, GFP_KERNEL);
2137 if (!new_xattr)
2138 return -ENOMEM;
2139
2140 new_xattr->name = kstrdup(name, GFP_KERNEL);
2141 if (!new_xattr->name) {
2142 kfree(new_xattr);
2143 return -ENOMEM;
2144 }
2145
2146 new_xattr->size = size;
2147 memcpy(new_xattr->value, value, size);
2148 }
2149
2150 spin_lock(&info->lock);
2151 list_for_each_entry(xattr, &info->xattr_list, list) {
2152 if (!strcmp(name, xattr->name)) {
2153 if (flags & XATTR_CREATE) {
2154 xattr = new_xattr;
2155 err = -EEXIST;
2156 } else if (new_xattr) {
2157 list_replace(&xattr->list, &new_xattr->list);
2158 } else {
2159 list_del(&xattr->list);
2160 }
2161 goto out;
2162 }
2163 }
2164 if (flags & XATTR_REPLACE) {
2165 xattr = new_xattr;
2166 err = -ENODATA;
2167 } else {
2168 list_add(&new_xattr->list, &info->xattr_list);
2169 xattr = NULL;
2170 }
2171out:
2172 spin_unlock(&info->lock);
2173 if (xattr)
2174 kfree(xattr->name);
2175 kfree(xattr);
2176 return err;
2109} 2177}
2110 2178
2111static const struct xattr_handler shmem_xattr_security_handler = {
2112 .prefix = XATTR_SECURITY_PREFIX,
2113 .list = shmem_xattr_security_list,
2114 .get = shmem_xattr_security_get,
2115 .set = shmem_xattr_security_set,
2116};
2117 2179
2118static const struct xattr_handler *shmem_xattr_handlers[] = { 2180static const struct xattr_handler *shmem_xattr_handlers[] = {
2181#ifdef CONFIG_TMPFS_POSIX_ACL
2119 &generic_acl_access_handler, 2182 &generic_acl_access_handler,
2120 &generic_acl_default_handler, 2183 &generic_acl_default_handler,
2121 &shmem_xattr_security_handler, 2184#endif
2122 NULL 2185 NULL
2123}; 2186};
2187
2188static int shmem_xattr_validate(const char *name)
2189{
2190 struct { const char *prefix; size_t len; } arr[] = {
2191 { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
2192 { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
2193 };
2194 int i;
2195
2196 for (i = 0; i < ARRAY_SIZE(arr); i++) {
2197 size_t preflen = arr[i].len;
2198 if (strncmp(name, arr[i].prefix, preflen) == 0) {
2199 if (!name[preflen])
2200 return -EINVAL;
2201 return 0;
2202 }
2203 }
2204 return -EOPNOTSUPP;
2205}
2206
2207static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2208 void *buffer, size_t size)
2209{
2210 int err;
2211
2212 /*
2213 * If this is a request for a synthetic attribute in the system.*
2214 * namespace use the generic infrastructure to resolve a handler
2215 * for it via sb->s_xattr.
2216 */
2217 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2218 return generic_getxattr(dentry, name, buffer, size);
2219
2220 err = shmem_xattr_validate(name);
2221 if (err)
2222 return err;
2223
2224 return shmem_xattr_get(dentry, name, buffer, size);
2225}
2226
2227static int shmem_setxattr(struct dentry *dentry, const char *name,
2228 const void *value, size_t size, int flags)
2229{
2230 int err;
2231
2232 /*
2233 * If this is a request for a synthetic attribute in the system.*
2234 * namespace use the generic infrastructure to resolve a handler
2235 * for it via sb->s_xattr.
2236 */
2237 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2238 return generic_setxattr(dentry, name, value, size, flags);
2239
2240 err = shmem_xattr_validate(name);
2241 if (err)
2242 return err;
2243
2244 if (size == 0)
2245 value = ""; /* empty EA, do not remove */
2246
2247 return shmem_xattr_set(dentry, name, value, size, flags);
2248
2249}
2250
2251static int shmem_removexattr(struct dentry *dentry, const char *name)
2252{
2253 int err;
2254
2255 /*
2256 * If this is a request for a synthetic attribute in the system.*
2257 * namespace use the generic infrastructure to resolve a handler
2258 * for it via sb->s_xattr.
2259 */
2260 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2261 return generic_removexattr(dentry, name);
2262
2263 err = shmem_xattr_validate(name);
2264 if (err)
2265 return err;
2266
2267 return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE);
2268}
2269
2270static bool xattr_is_trusted(const char *name)
2271{
2272 return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
2273}
2274
2275static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2276{
2277 bool trusted = capable(CAP_SYS_ADMIN);
2278 struct shmem_xattr *xattr;
2279 struct shmem_inode_info *info;
2280 size_t used = 0;
2281
2282 info = SHMEM_I(dentry->d_inode);
2283
2284 spin_lock(&info->lock);
2285 list_for_each_entry(xattr, &info->xattr_list, list) {
2286 size_t len;
2287
2288 /* skip "trusted." attributes for unprivileged callers */
2289 if (!trusted && xattr_is_trusted(xattr->name))
2290 continue;
2291
2292 len = strlen(xattr->name) + 1;
2293 used += len;
2294 if (buffer) {
2295 if (size < used) {
2296 used = -ERANGE;
2297 break;
2298 }
2299 memcpy(buffer, xattr->name, len);
2300 buffer += len;
2301 }
2302 }
2303 spin_unlock(&info->lock);
2304
2305 return used;
2306}
2307#endif /* CONFIG_TMPFS_XATTR */
2308
2309static const struct inode_operations shmem_symlink_inline_operations = {
2310 .readlink = generic_readlink,
2311 .follow_link = shmem_follow_link_inline,
2312#ifdef CONFIG_TMPFS_XATTR
2313 .setxattr = shmem_setxattr,
2314 .getxattr = shmem_getxattr,
2315 .listxattr = shmem_listxattr,
2316 .removexattr = shmem_removexattr,
2124#endif 2317#endif
2318};
2319
2320static const struct inode_operations shmem_symlink_inode_operations = {
2321 .readlink = generic_readlink,
2322 .follow_link = shmem_follow_link,
2323 .put_link = shmem_put_link,
2324#ifdef CONFIG_TMPFS_XATTR
2325 .setxattr = shmem_setxattr,
2326 .getxattr = shmem_getxattr,
2327 .listxattr = shmem_listxattr,
2328 .removexattr = shmem_removexattr,
2329#endif
2330};
2125 2331
2126static struct dentry *shmem_get_parent(struct dentry *child) 2332static struct dentry *shmem_get_parent(struct dentry *child)
2127{ 2333{
@@ -2401,8 +2607,10 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2401 sb->s_magic = TMPFS_MAGIC; 2607 sb->s_magic = TMPFS_MAGIC;
2402 sb->s_op = &shmem_ops; 2608 sb->s_op = &shmem_ops;
2403 sb->s_time_gran = 1; 2609 sb->s_time_gran = 1;
2404#ifdef CONFIG_TMPFS_POSIX_ACL 2610#ifdef CONFIG_TMPFS_XATTR
2405 sb->s_xattr = shmem_xattr_handlers; 2611 sb->s_xattr = shmem_xattr_handlers;
2612#endif
2613#ifdef CONFIG_TMPFS_POSIX_ACL
2406 sb->s_flags |= MS_POSIXACL; 2614 sb->s_flags |= MS_POSIXACL;
2407#endif 2615#endif
2408 2616
@@ -2500,11 +2708,13 @@ static const struct file_operations shmem_file_operations = {
2500static const struct inode_operations shmem_inode_operations = { 2708static const struct inode_operations shmem_inode_operations = {
2501 .setattr = shmem_notify_change, 2709 .setattr = shmem_notify_change,
2502 .truncate_range = shmem_truncate_range, 2710 .truncate_range = shmem_truncate_range,
2711#ifdef CONFIG_TMPFS_XATTR
2712 .setxattr = shmem_setxattr,
2713 .getxattr = shmem_getxattr,
2714 .listxattr = shmem_listxattr,
2715 .removexattr = shmem_removexattr,
2716#endif
2503#ifdef CONFIG_TMPFS_POSIX_ACL 2717#ifdef CONFIG_TMPFS_POSIX_ACL
2504 .setxattr = generic_setxattr,
2505 .getxattr = generic_getxattr,
2506 .listxattr = generic_listxattr,
2507 .removexattr = generic_removexattr,
2508 .check_acl = generic_check_acl, 2718 .check_acl = generic_check_acl,
2509#endif 2719#endif
2510 2720
@@ -2522,23 +2732,27 @@ static const struct inode_operations shmem_dir_inode_operations = {
2522 .mknod = shmem_mknod, 2732 .mknod = shmem_mknod,
2523 .rename = shmem_rename, 2733 .rename = shmem_rename,
2524#endif 2734#endif
2735#ifdef CONFIG_TMPFS_XATTR
2736 .setxattr = shmem_setxattr,
2737 .getxattr = shmem_getxattr,
2738 .listxattr = shmem_listxattr,
2739 .removexattr = shmem_removexattr,
2740#endif
2525#ifdef CONFIG_TMPFS_POSIX_ACL 2741#ifdef CONFIG_TMPFS_POSIX_ACL
2526 .setattr = shmem_notify_change, 2742 .setattr = shmem_notify_change,
2527 .setxattr = generic_setxattr,
2528 .getxattr = generic_getxattr,
2529 .listxattr = generic_listxattr,
2530 .removexattr = generic_removexattr,
2531 .check_acl = generic_check_acl, 2743 .check_acl = generic_check_acl,
2532#endif 2744#endif
2533}; 2745};
2534 2746
2535static const struct inode_operations shmem_special_inode_operations = { 2747static const struct inode_operations shmem_special_inode_operations = {
2748#ifdef CONFIG_TMPFS_XATTR
2749 .setxattr = shmem_setxattr,
2750 .getxattr = shmem_getxattr,
2751 .listxattr = shmem_listxattr,
2752 .removexattr = shmem_removexattr,
2753#endif
2536#ifdef CONFIG_TMPFS_POSIX_ACL 2754#ifdef CONFIG_TMPFS_POSIX_ACL
2537 .setattr = shmem_notify_change, 2755 .setattr = shmem_notify_change,
2538 .setxattr = generic_setxattr,
2539 .getxattr = generic_getxattr,
2540 .listxattr = generic_listxattr,
2541 .removexattr = generic_removexattr,
2542 .check_acl = generic_check_acl, 2756 .check_acl = generic_check_acl,
2543#endif 2757#endif
2544}; 2758};
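
The tmpfs xattr support added above keeps each attribute as a kmalloc'ed shmem_xattr node on a per-inode list and answers get/set/list with a linear name scan under info->lock. The standalone C below mirrors the lookup contract of shmem_xattr_get() (return the value size, copy only when a buffer is supplied, -ERANGE when it is too small, -ENODATA when absent); the list type and names here are illustrative, not the kernel's.

    #include <stdio.h>
    #include <string.h>

    #define ENODATA_ERR (-61)   /* same errno values the kernel path returns */
    #define ERANGE_ERR  (-34)

    struct xattr {
        struct xattr *next;
        const char *name;
        const char *value;
        size_t size;
    };

    /* Same contract as shmem_xattr_get(): return the value size, copy it when a
     * buffer is given, -ERANGE if the buffer is too small, -ENODATA if missing. */
    static int xattr_get(const struct xattr *list, const char *name,
                         void *buf, size_t bufsize)
    {
        for (; list; list = list->next) {
            if (strcmp(name, list->name))
                continue;
            if (!buf)
                return (int)list->size;         /* size probe */
            if (bufsize < list->size)
                return ERANGE_ERR;
            memcpy(buf, list->value, list->size);
            return (int)list->size;
        }
        return ENODATA_ERR;
    }

    int main(void)
    {
        struct xattr b = { NULL, "trusted.tag", "blue", 4 };
        struct xattr a = { &b, "security.selinux", "label", 5 };
        char buf[16];

        printf("probe=%d\n", xattr_get(&a, "trusted.tag", NULL, 0));          /* 4 */
        printf("copy=%d\n", xattr_get(&a, "trusted.tag", buf, sizeof(buf)));  /* 4 */
        printf("missing=%d\n", xattr_get(&a, "user.none", buf, sizeof(buf))); /* -61 */
        return 0;
    }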
diff --git a/mm/slab.c b/mm/slab.c
index 46a9c163a92f..bcfa4987c8ae 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -115,6 +115,7 @@
115#include <linux/debugobjects.h> 115#include <linux/debugobjects.h>
116#include <linux/kmemcheck.h> 116#include <linux/kmemcheck.h>
117#include <linux/memory.h> 117#include <linux/memory.h>
118#include <linux/prefetch.h>
118 119
119#include <asm/cacheflush.h> 120#include <asm/cacheflush.h>
120#include <asm/tlbflush.h> 121#include <asm/tlbflush.h>
diff --git a/mm/slub.c b/mm/slub.c
index 9d2e5e46bf09..7be0223531b0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -261,6 +261,18 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
261 return *(void **)(object + s->offset); 261 return *(void **)(object + s->offset);
262} 262}
263 263
264static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
265{
266 void *p;
267
268#ifdef CONFIG_DEBUG_PAGEALLOC
269 probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
270#else
271 p = get_freepointer(s, object);
272#endif
273 return p;
274}
275
264static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) 276static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
265{ 277{
266 *(void **)(object + s->offset) = fp; 278 *(void **)(object + s->offset) = fp;
@@ -271,10 +283,6 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
271 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ 283 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
272 __p += (__s)->size) 284 __p += (__s)->size)
273 285
274/* Scan freelist */
275#define for_each_free_object(__p, __s, __free) \
276 for (__p = (__free); __p; __p = get_freepointer((__s), __p))
277
278/* Determine object index from a given position */ 286/* Determine object index from a given position */
279static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 287static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
280{ 288{
@@ -332,6 +340,21 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
332 340
333#ifdef CONFIG_SLUB_DEBUG 341#ifdef CONFIG_SLUB_DEBUG
334/* 342/*
343 * Determine a map of object in use on a page.
344 *
345 * Slab lock or node listlock must be held to guarantee that the page does
346 * not vanish from under us.
347 */
348static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
349{
350 void *p;
351 void *addr = page_address(page);
352
353 for (p = page->freelist; p; p = get_freepointer(s, p))
354 set_bit(slab_index(p, s, addr), map);
355}
356
357/*
335 * Debug settings: 358 * Debug settings:
336 */ 359 */
337#ifdef CONFIG_SLUB_DEBUG_ON 360#ifdef CONFIG_SLUB_DEBUG_ON
@@ -1487,7 +1510,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1487 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; 1510 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1488 1511
1489 page = get_partial_node(get_node(s, searchnode)); 1512 page = get_partial_node(get_node(s, searchnode));
1490 if (page || node != -1) 1513 if (page || node != NUMA_NO_NODE)
1491 return page; 1514 return page;
1492 1515
1493 return get_any_partial(s, flags); 1516 return get_any_partial(s, flags);
@@ -1540,7 +1563,6 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1540 } 1563 }
1541} 1564}
1542 1565
1543#ifdef CONFIG_CMPXCHG_LOCAL
1544#ifdef CONFIG_PREEMPT 1566#ifdef CONFIG_PREEMPT
1545/* 1567/*
1546 * Calculate the next globally unique transaction for disambiguation 1568
@@ -1600,17 +1622,12 @@ static inline void note_cmpxchg_failure(const char *n,
1600 stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 1622 stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
1601} 1623}
1602 1624
1603#endif
1604
1605void init_kmem_cache_cpus(struct kmem_cache *s) 1625void init_kmem_cache_cpus(struct kmem_cache *s)
1606{ 1626{
1607#ifdef CONFIG_CMPXCHG_LOCAL
1608 int cpu; 1627 int cpu;
1609 1628
1610 for_each_possible_cpu(cpu) 1629 for_each_possible_cpu(cpu)
1611 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); 1630 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
1612#endif
1613
1614} 1631}
1615/* 1632/*
1616 * Remove the cpu slab 1633 * Remove the cpu slab
@@ -1643,9 +1660,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1643 page->inuse--; 1660 page->inuse--;
1644 } 1661 }
1645 c->page = NULL; 1662 c->page = NULL;
1646#ifdef CONFIG_CMPXCHG_LOCAL
1647 c->tid = next_tid(c->tid); 1663 c->tid = next_tid(c->tid);
1648#endif
1649 unfreeze_slab(s, page, tail); 1664 unfreeze_slab(s, page, tail);
1650} 1665}
1651 1666
@@ -1779,8 +1794,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1779 unsigned long addr, struct kmem_cache_cpu *c) 1794 unsigned long addr, struct kmem_cache_cpu *c)
1780{ 1795{
1781 void **object; 1796 void **object;
1782 struct page *new; 1797 struct page *page;
1783#ifdef CONFIG_CMPXCHG_LOCAL
1784 unsigned long flags; 1798 unsigned long flags;
1785 1799
1786 local_irq_save(flags); 1800 local_irq_save(flags);
@@ -1792,37 +1806,34 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1792 */ 1806 */
1793 c = this_cpu_ptr(s->cpu_slab); 1807 c = this_cpu_ptr(s->cpu_slab);
1794#endif 1808#endif
1795#endif
1796 1809
1797 /* We handle __GFP_ZERO in the caller */ 1810 /* We handle __GFP_ZERO in the caller */
1798 gfpflags &= ~__GFP_ZERO; 1811 gfpflags &= ~__GFP_ZERO;
1799 1812
1800 if (!c->page) 1813 page = c->page;
1814 if (!page)
1801 goto new_slab; 1815 goto new_slab;
1802 1816
1803 slab_lock(c->page); 1817 slab_lock(page);
1804 if (unlikely(!node_match(c, node))) 1818 if (unlikely(!node_match(c, node)))
1805 goto another_slab; 1819 goto another_slab;
1806 1820
1807 stat(s, ALLOC_REFILL); 1821 stat(s, ALLOC_REFILL);
1808 1822
1809load_freelist: 1823load_freelist:
1810 object = c->page->freelist; 1824 object = page->freelist;
1811 if (unlikely(!object)) 1825 if (unlikely(!object))
1812 goto another_slab; 1826 goto another_slab;
1813 if (kmem_cache_debug(s)) 1827 if (kmem_cache_debug(s))
1814 goto debug; 1828 goto debug;
1815 1829
1816 c->freelist = get_freepointer(s, object); 1830 c->freelist = get_freepointer(s, object);
1817 c->page->inuse = c->page->objects; 1831 page->inuse = page->objects;
1818 c->page->freelist = NULL; 1832 page->freelist = NULL;
1819 c->node = page_to_nid(c->page); 1833
1820unlock_out: 1834 slab_unlock(page);
1821 slab_unlock(c->page);
1822#ifdef CONFIG_CMPXCHG_LOCAL
1823 c->tid = next_tid(c->tid); 1835 c->tid = next_tid(c->tid);
1824 local_irq_restore(flags); 1836 local_irq_restore(flags);
1825#endif
1826 stat(s, ALLOC_SLOWPATH); 1837 stat(s, ALLOC_SLOWPATH);
1827 return object; 1838 return object;
1828 1839
@@ -1830,10 +1841,11 @@ another_slab:
1830 deactivate_slab(s, c); 1841 deactivate_slab(s, c);
1831 1842
1832new_slab: 1843new_slab:
1833 new = get_partial(s, gfpflags, node); 1844 page = get_partial(s, gfpflags, node);
1834 if (new) { 1845 if (page) {
1835 c->page = new;
1836 stat(s, ALLOC_FROM_PARTIAL); 1846 stat(s, ALLOC_FROM_PARTIAL);
1847 c->node = page_to_nid(page);
1848 c->page = page;
1837 goto load_freelist; 1849 goto load_freelist;
1838 } 1850 }
1839 1851
@@ -1841,35 +1853,38 @@ new_slab:
1841 if (gfpflags & __GFP_WAIT) 1853 if (gfpflags & __GFP_WAIT)
1842 local_irq_enable(); 1854 local_irq_enable();
1843 1855
1844 new = new_slab(s, gfpflags, node); 1856 page = new_slab(s, gfpflags, node);
1845 1857
1846 if (gfpflags & __GFP_WAIT) 1858 if (gfpflags & __GFP_WAIT)
1847 local_irq_disable(); 1859 local_irq_disable();
1848 1860
1849 if (new) { 1861 if (page) {
1850 c = __this_cpu_ptr(s->cpu_slab); 1862 c = __this_cpu_ptr(s->cpu_slab);
1851 stat(s, ALLOC_SLAB); 1863 stat(s, ALLOC_SLAB);
1852 if (c->page) 1864 if (c->page)
1853 flush_slab(s, c); 1865 flush_slab(s, c);
1854 slab_lock(new); 1866
1855 __SetPageSlubFrozen(new); 1867 slab_lock(page);
1856 c->page = new; 1868 __SetPageSlubFrozen(page);
1869 c->node = page_to_nid(page);
1870 c->page = page;
1857 goto load_freelist; 1871 goto load_freelist;
1858 } 1872 }
1859 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 1873 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1860 slab_out_of_memory(s, gfpflags, node); 1874 slab_out_of_memory(s, gfpflags, node);
1861#ifdef CONFIG_CMPXCHG_LOCAL
1862 local_irq_restore(flags); 1875 local_irq_restore(flags);
1863#endif
1864 return NULL; 1876 return NULL;
1865debug: 1877debug:
1866 if (!alloc_debug_processing(s, c->page, object, addr)) 1878 if (!alloc_debug_processing(s, page, object, addr))
1867 goto another_slab; 1879 goto another_slab;
1868 1880
1869 c->page->inuse++; 1881 page->inuse++;
1870 c->page->freelist = get_freepointer(s, object); 1882 page->freelist = get_freepointer(s, object);
1883 deactivate_slab(s, c);
1884 c->page = NULL;
1871 c->node = NUMA_NO_NODE; 1885 c->node = NUMA_NO_NODE;
1872 goto unlock_out; 1886 local_irq_restore(flags);
1887 return object;
1873} 1888}
1874 1889
1875/* 1890/*
@@ -1887,20 +1902,12 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1887{ 1902{
1888 void **object; 1903 void **object;
1889 struct kmem_cache_cpu *c; 1904 struct kmem_cache_cpu *c;
1890#ifdef CONFIG_CMPXCHG_LOCAL
1891 unsigned long tid; 1905 unsigned long tid;
1892#else
1893 unsigned long flags;
1894#endif
1895 1906
1896 if (slab_pre_alloc_hook(s, gfpflags)) 1907 if (slab_pre_alloc_hook(s, gfpflags))
1897 return NULL; 1908 return NULL;
1898 1909
1899#ifndef CONFIG_CMPXCHG_LOCAL
1900 local_irq_save(flags);
1901#else
1902redo: 1910redo:
1903#endif
1904 1911
1905 /* 1912 /*
1906 * Must read kmem_cache cpu data via this cpu ptr. Preemption is 1913 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
@@ -1910,7 +1917,6 @@ redo:
1910 */ 1917 */
1911 c = __this_cpu_ptr(s->cpu_slab); 1918 c = __this_cpu_ptr(s->cpu_slab);
1912 1919
1913#ifdef CONFIG_CMPXCHG_LOCAL
1914 /* 1920 /*
1915 * The transaction ids are globally unique per cpu and per operation on 1921 * The transaction ids are globally unique per cpu and per operation on
1916 * a per cpu queue. Thus they can guarantee that the cmpxchg_double 1922
@@ -1919,7 +1925,6 @@ redo:
1919 */ 1925 */
1920 tid = c->tid; 1926 tid = c->tid;
1921 barrier(); 1927 barrier();
1922#endif
1923 1928
1924 object = c->freelist; 1929 object = c->freelist;
1925 if (unlikely(!object || !node_match(c, node))) 1930 if (unlikely(!object || !node_match(c, node)))
@@ -1927,7 +1932,6 @@ redo:
1927 object = __slab_alloc(s, gfpflags, node, addr, c); 1932 object = __slab_alloc(s, gfpflags, node, addr, c);
1928 1933
1929 else { 1934 else {
1930#ifdef CONFIG_CMPXCHG_LOCAL
1931 /* 1935 /*
1932 * The cmpxchg will only match if there was no additional 1936 * The cmpxchg will only match if there was no additional
1933 * operation and if we are on the right processor. 1937 * operation and if we are on the right processor.
@@ -1943,21 +1947,14 @@ redo:
1943 if (unlikely(!irqsafe_cpu_cmpxchg_double( 1947 if (unlikely(!irqsafe_cpu_cmpxchg_double(
1944 s->cpu_slab->freelist, s->cpu_slab->tid, 1948 s->cpu_slab->freelist, s->cpu_slab->tid,
1945 object, tid, 1949 object, tid,
1946 get_freepointer(s, object), next_tid(tid)))) { 1950 get_freepointer_safe(s, object), next_tid(tid)))) {
1947 1951
1948 note_cmpxchg_failure("slab_alloc", s, tid); 1952 note_cmpxchg_failure("slab_alloc", s, tid);
1949 goto redo; 1953 goto redo;
1950 } 1954 }
1951#else
1952 c->freelist = get_freepointer(s, object);
1953#endif
1954 stat(s, ALLOC_FASTPATH); 1955 stat(s, ALLOC_FASTPATH);
1955 } 1956 }
1956 1957
1957#ifndef CONFIG_CMPXCHG_LOCAL
1958 local_irq_restore(flags);
1959#endif
1960
1961 if (unlikely(gfpflags & __GFP_ZERO) && object) 1958 if (unlikely(gfpflags & __GFP_ZERO) && object)
1962 memset(object, 0, s->objsize); 1959 memset(object, 0, s->objsize);
1963 1960
@@ -2034,18 +2031,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2034{ 2031{
2035 void *prior; 2032 void *prior;
2036 void **object = (void *)x; 2033 void **object = (void *)x;
2037#ifdef CONFIG_CMPXCHG_LOCAL
2038 unsigned long flags; 2034 unsigned long flags;
2039 2035
2040 local_irq_save(flags); 2036 local_irq_save(flags);
2041#endif
2042 slab_lock(page); 2037 slab_lock(page);
2043 stat(s, FREE_SLOWPATH); 2038 stat(s, FREE_SLOWPATH);
2044 2039
2045 if (kmem_cache_debug(s)) 2040 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
2046 goto debug; 2041 goto out_unlock;
2047 2042
2048checks_ok:
2049 prior = page->freelist; 2043 prior = page->freelist;
2050 set_freepointer(s, object, prior); 2044 set_freepointer(s, object, prior);
2051 page->freelist = object; 2045 page->freelist = object;
@@ -2070,9 +2064,7 @@ checks_ok:
2070 2064
2071out_unlock: 2065out_unlock:
2072 slab_unlock(page); 2066 slab_unlock(page);
2073#ifdef CONFIG_CMPXCHG_LOCAL
2074 local_irq_restore(flags); 2067 local_irq_restore(flags);
2075#endif
2076 return; 2068 return;
2077 2069
2078slab_empty: 2070slab_empty:
@@ -2084,17 +2076,9 @@ slab_empty:
2084 stat(s, FREE_REMOVE_PARTIAL); 2076 stat(s, FREE_REMOVE_PARTIAL);
2085 } 2077 }
2086 slab_unlock(page); 2078 slab_unlock(page);
2087#ifdef CONFIG_CMPXCHG_LOCAL
2088 local_irq_restore(flags); 2079 local_irq_restore(flags);
2089#endif
2090 stat(s, FREE_SLAB); 2080 stat(s, FREE_SLAB);
2091 discard_slab(s, page); 2081 discard_slab(s, page);
2092 return;
2093
2094debug:
2095 if (!free_debug_processing(s, page, x, addr))
2096 goto out_unlock;
2097 goto checks_ok;
2098} 2082}
2099 2083
2100/* 2084/*
@@ -2113,20 +2097,11 @@ static __always_inline void slab_free(struct kmem_cache *s,
2113{ 2097{
2114 void **object = (void *)x; 2098 void **object = (void *)x;
2115 struct kmem_cache_cpu *c; 2099 struct kmem_cache_cpu *c;
2116#ifdef CONFIG_CMPXCHG_LOCAL
2117 unsigned long tid; 2100 unsigned long tid;
2118#else
2119 unsigned long flags;
2120#endif
2121 2101
2122 slab_free_hook(s, x); 2102 slab_free_hook(s, x);
2123 2103
2124#ifndef CONFIG_CMPXCHG_LOCAL
2125 local_irq_save(flags);
2126
2127#else
2128redo: 2104redo:
2129#endif
2130 2105
2131 /* 2106 /*
2132 * Determine the current cpu's per cpu slab. 2107
@@ -2136,15 +2111,12 @@ redo:
2136 */ 2111 */
2137 c = __this_cpu_ptr(s->cpu_slab); 2112 c = __this_cpu_ptr(s->cpu_slab);
2138 2113
2139#ifdef CONFIG_CMPXCHG_LOCAL
2140 tid = c->tid; 2114 tid = c->tid;
2141 barrier(); 2115 barrier();
2142#endif
2143 2116
2144 if (likely(page == c->page && c->node != NUMA_NO_NODE)) { 2117 if (likely(page == c->page)) {
2145 set_freepointer(s, object, c->freelist); 2118 set_freepointer(s, object, c->freelist);
2146 2119
2147#ifdef CONFIG_CMPXCHG_LOCAL
2148 if (unlikely(!irqsafe_cpu_cmpxchg_double( 2120 if (unlikely(!irqsafe_cpu_cmpxchg_double(
2149 s->cpu_slab->freelist, s->cpu_slab->tid, 2121 s->cpu_slab->freelist, s->cpu_slab->tid,
2150 c->freelist, tid, 2122 c->freelist, tid,
@@ -2153,16 +2125,10 @@ redo:
2153 note_cmpxchg_failure("slab_free", s, tid); 2125 note_cmpxchg_failure("slab_free", s, tid);
2154 goto redo; 2126 goto redo;
2155 } 2127 }
2156#else
2157 c->freelist = object;
2158#endif
2159 stat(s, FREE_FASTPATH); 2128 stat(s, FREE_FASTPATH);
2160 } else 2129 } else
2161 __slab_free(s, page, x, addr); 2130 __slab_free(s, page, x, addr);
2162 2131
2163#ifndef CONFIG_CMPXCHG_LOCAL
2164 local_irq_restore(flags);
2165#endif
2166} 2132}
2167 2133
2168void kmem_cache_free(struct kmem_cache *s, void *x) 2134void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -2673,9 +2639,8 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
2673 return; 2639 return;
2674 slab_err(s, page, "%s", text); 2640 slab_err(s, page, "%s", text);
2675 slab_lock(page); 2641 slab_lock(page);
2676 for_each_free_object(p, s, page->freelist)
2677 set_bit(slab_index(p, s, addr), map);
2678 2642
2643 get_map(s, page, map);
2679 for_each_object(p, s, addr, page->objects) { 2644 for_each_object(p, s, addr, page->objects) {
2680 2645
2681 if (!test_bit(slab_index(p, s, addr), map)) { 2646 if (!test_bit(slab_index(p, s, addr), map)) {
@@ -3203,7 +3168,7 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
3203 list_for_each_entry(p, &n->partial, lru) 3168 list_for_each_entry(p, &n->partial, lru)
3204 p->slab = s; 3169 p->slab = s;
3205 3170
3206#ifdef CONFIG_SLAB_DEBUG 3171#ifdef CONFIG_SLUB_DEBUG
3207 list_for_each_entry(p, &n->full, lru) 3172 list_for_each_entry(p, &n->full, lru)
3208 p->slab = s; 3173 p->slab = s;
3209#endif 3174#endif
@@ -3610,10 +3575,11 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
3610 /* Now we know that a valid freelist exists */ 3575 /* Now we know that a valid freelist exists */
3611 bitmap_zero(map, page->objects); 3576 bitmap_zero(map, page->objects);
3612 3577
3613 for_each_free_object(p, s, page->freelist) { 3578 get_map(s, page, map);
3614 set_bit(slab_index(p, s, addr), map); 3579 for_each_object(p, s, addr, page->objects) {
3615 if (!check_object(s, page, p, SLUB_RED_INACTIVE)) 3580 if (test_bit(slab_index(p, s, addr), map))
3616 return 0; 3581 if (!check_object(s, page, p, SLUB_RED_INACTIVE))
3582 return 0;
3617 } 3583 }
3618 3584
3619 for_each_object(p, s, addr, page->objects) 3585 for_each_object(p, s, addr, page->objects)
@@ -3821,8 +3787,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
3821 void *p; 3787 void *p;
3822 3788
3823 bitmap_zero(map, page->objects); 3789 bitmap_zero(map, page->objects);
3824 for_each_free_object(p, s, page->freelist) 3790 get_map(s, page, map);
3825 set_bit(slab_index(p, s, addr), map);
3826 3791
3827 for_each_object(p, s, addr, page->objects) 3792 for_each_object(p, s, addr, page->objects)
3828 if (!test_bit(slab_index(p, s, addr), map)) 3793 if (!test_bit(slab_index(p, s, addr), map))
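
Several SLUB hunks above fold the open-coded for_each_free_object() walks into the new get_map() helper: walk the page's freelist once, set one bit per free slot, then iterate every object and treat clear bits as allocated. Here is a userspace model of that idea, with a tiny slab whose free objects chain through their first word; none of these names are the kernel's.

    #include <stdio.h>

    #define OBJECTS 8

    /* Each free object stores the pointer to the next free object, like a SLUB
     * freelist with s->offset == 0; allocated objects hold user data instead. */
    static int slab_index(void **p, void **slab) { return (int)(p - slab); }

    static void get_map(void **freelist, void **slab, unsigned long *map)
    {
        void **p;
        for (p = freelist; p; p = (void **)*p)
            *map |= 1UL << slab_index(p, slab);
    }

    int main(void)
    {
        void *slab[OBJECTS] = { 0 };
        unsigned long map = 0;
        int i;

        /* Chain objects 1 -> 4 -> 6 onto the freelist. */
        slab[1] = &slab[4];
        slab[4] = &slab[6];
        slab[6] = NULL;
        get_map(&slab[1], slab, &map);

        for (i = 0; i < OBJECTS; i++)   /* clear bit => object is in use */
            printf("object %d: %s\n", i, (map & (1UL << i)) ? "free" : "in use");
        return 0;
    }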
diff --git a/mm/swap.c b/mm/swap.c
index 5602f1a1b1e7..3a442f18b0b3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -272,14 +272,10 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page,
272 memcg_reclaim_stat->recent_rotated[file]++; 272 memcg_reclaim_stat->recent_rotated[file]++;
273} 273}
274 274
275/* 275static void __activate_page(struct page *page, void *arg)
276 * FIXME: speed this up?
277 */
278void activate_page(struct page *page)
279{ 276{
280 struct zone *zone = page_zone(page); 277 struct zone *zone = page_zone(page);
281 278
282 spin_lock_irq(&zone->lru_lock);
283 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 279 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
284 int file = page_is_file_cache(page); 280 int file = page_is_file_cache(page);
285 int lru = page_lru_base_type(page); 281 int lru = page_lru_base_type(page);
@@ -292,8 +288,45 @@ void activate_page(struct page *page)
292 288
293 update_page_reclaim_stat(zone, page, file, 1); 289 update_page_reclaim_stat(zone, page, file, 1);
294 } 290 }
291}
292
293#ifdef CONFIG_SMP
294static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
295
296static void activate_page_drain(int cpu)
297{
298 struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
299
300 if (pagevec_count(pvec))
301 pagevec_lru_move_fn(pvec, __activate_page, NULL);
302}
303
304void activate_page(struct page *page)
305{
306 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
307 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
308
309 page_cache_get(page);
310 if (!pagevec_add(pvec, page))
311 pagevec_lru_move_fn(pvec, __activate_page, NULL);
312 put_cpu_var(activate_page_pvecs);
313 }
314}
315
316#else
317static inline void activate_page_drain(int cpu)
318{
319}
320
321void activate_page(struct page *page)
322{
323 struct zone *zone = page_zone(page);
324
325 spin_lock_irq(&zone->lru_lock);
326 __activate_page(page, NULL);
295 spin_unlock_irq(&zone->lru_lock); 327 spin_unlock_irq(&zone->lru_lock);
296} 328}
329#endif
297 330
298/* 331/*
299 * Mark a page as having seen activity. 332 * Mark a page as having seen activity.
@@ -464,6 +497,8 @@ static void drain_cpu_pagevecs(int cpu)
464 pvec = &per_cpu(lru_deactivate_pvecs, cpu); 497 pvec = &per_cpu(lru_deactivate_pvecs, cpu);
465 if (pagevec_count(pvec)) 498 if (pagevec_count(pvec))
466 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); 499 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
500
501 activate_page_drain(cpu);
467} 502}
468 503
469/** 504/**
@@ -476,6 +511,13 @@ static void drain_cpu_pagevecs(int cpu)
476 */ 511 */
477void deactivate_page(struct page *page) 512void deactivate_page(struct page *page)
478{ 513{
514 /*
515 * In a workload with many unevictable pages (e.g. heavy mprotect use),
516 * deactivating unevictable pages to accelerate reclaim is pointless.
517 */
518 if (PageUnevictable(page))
519 return;
520
479 if (likely(get_page_unless_zero(page))) { 521 if (likely(get_page_unless_zero(page))) {
480 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); 522 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
481 523
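
On SMP, activate_page() above now stages pages in a per-CPU pagevec and only takes zone->lru_lock when the batch fills up or drain_cpu_pagevecs() runs, instead of locking once per page. The toy below shows just that batching shape in plain C; the names and the "lock" printout are illustrative, and PAGEVEC_SIZE matching the kernel's 14 is the only borrowed detail.

    #include <stdio.h>

    #define PAGEVEC_SIZE 14     /* the batch size a kernel pagevec uses */

    struct pagevec {
        int nr;
        int pages[PAGEVEC_SIZE];
    };

    static void flush(struct pagevec *pv)
    {
        int i;
        /* In the kernel this is pagevec_lru_move_fn(): take zone->lru_lock once,
         * then run __activate_page() on every buffered page. */
        printf("lock; activating %d pages:", pv->nr);
        for (i = 0; i < pv->nr; i++)
            printf(" %d", pv->pages[i]);
        printf("; unlock\n");
        pv->nr = 0;
    }

    static void activate_page(struct pagevec *pv, int page)
    {
        pv->pages[pv->nr++] = page;
        if (pv->nr == PAGEVEC_SIZE)     /* batch full: drain under the lock */
            flush(pv);
    }

    int main(void)
    {
        struct pagevec pv = { 0 };
        int i;

        for (i = 0; i < 30; i++)
            activate_page(&pv, i);
        flush(&pv);                     /* explicit drain, like activate_page_drain() */
        return 0;
    }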
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8c6b3ce38f09..d537d29e9b7b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,7 @@
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/poll.h> 33#include <linux/poll.h>
34#include <linux/oom.h>
34 35
35#include <asm/pgtable.h> 36#include <asm/pgtable.h>
36#include <asm/tlbflush.h> 37#include <asm/tlbflush.h>
@@ -1555,6 +1556,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1555 struct address_space *mapping; 1556 struct address_space *mapping;
1556 struct inode *inode; 1557 struct inode *inode;
1557 char *pathname; 1558 char *pathname;
1559 int oom_score_adj;
1558 int i, type, prev; 1560 int i, type, prev;
1559 int err; 1561 int err;
1560 1562
@@ -1613,9 +1615,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1613 p->flags &= ~SWP_WRITEOK; 1615 p->flags &= ~SWP_WRITEOK;
1614 spin_unlock(&swap_lock); 1616 spin_unlock(&swap_lock);
1615 1617
1616 current->flags |= PF_OOM_ORIGIN; 1618 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1617 err = try_to_unuse(type); 1619 err = try_to_unuse(type);
1618 current->flags &= ~PF_OOM_ORIGIN; 1620 test_set_oom_score_adj(oom_score_adj);
1619 1621
1620 if (err) { 1622 if (err) {
1621 /* 1623 /*
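
swapoff now marks itself as the preferred OOM victim by setting oom_score_adj to OOM_SCORE_ADJ_MAX for the duration of try_to_unuse() and restoring whatever value was there before, rather than toggling the old PF_OOM_ORIGIN flag. The essential pattern is a set-and-return-old helper so a caller-chosen value survives the window; a userspace stand-in for that helper (the global here models current->signal->oom_score_adj):

    #include <stdio.h>

    #define OOM_SCORE_ADJ_MAX 1000

    static int current_oom_score_adj;   /* stand-in for the per-task field */

    /* Set a new value and hand back the old one, like test_set_oom_score_adj(). */
    static int test_set_oom_score_adj(int new_val)
    {
        int old = current_oom_score_adj;
        current_oom_score_adj = new_val;
        return old;
    }

    int main(void)
    {
        int old = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);

        printf("during unuse: %d\n", current_oom_score_adj);   /* 1000: first OOM pick */
        test_set_oom_score_adj(old);                           /* restore caller's value */
        printf("after restore: %d\n", current_oom_score_adj);
        return 0;
    }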
diff --git a/mm/truncate.c b/mm/truncate.c
index a95667529135..3a29a6180212 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -19,6 +19,7 @@
19#include <linux/task_io_accounting_ops.h> 19#include <linux/task_io_accounting_ops.h>
20#include <linux/buffer_head.h> /* grr. try_to_release_page, 20#include <linux/buffer_head.h> /* grr. try_to_release_page,
21 do_invalidatepage */ 21 do_invalidatepage */
22#include <linux/cleancache.h>
22#include "internal.h" 23#include "internal.h"
23 24
24 25
@@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
51static inline void truncate_partial_page(struct page *page, unsigned partial) 52static inline void truncate_partial_page(struct page *page, unsigned partial)
52{ 53{
53 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 54 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
55 cleancache_flush_page(page->mapping, page);
54 if (page_has_private(page)) 56 if (page_has_private(page))
55 do_invalidatepage(page, partial); 57 do_invalidatepage(page, partial);
56} 58}
@@ -214,6 +216,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
214 pgoff_t next; 216 pgoff_t next;
215 int i; 217 int i;
216 218
219 cleancache_flush_inode(mapping);
217 if (mapping->nrpages == 0) 220 if (mapping->nrpages == 0)
218 return; 221 return;
219 222
@@ -291,6 +294,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
291 pagevec_release(&pvec); 294 pagevec_release(&pvec);
292 mem_cgroup_uncharge_end(); 295 mem_cgroup_uncharge_end();
293 } 296 }
297 cleancache_flush_inode(mapping);
294} 298}
295EXPORT_SYMBOL(truncate_inode_pages_range); 299EXPORT_SYMBOL(truncate_inode_pages_range);
296 300
@@ -440,6 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
440 int did_range_unmap = 0; 444 int did_range_unmap = 0;
441 int wrapped = 0; 445 int wrapped = 0;
442 446
447 cleancache_flush_inode(mapping);
443 pagevec_init(&pvec, 0); 448 pagevec_init(&pvec, 0);
444 next = start; 449 next = start;
445 while (next <= end && !wrapped && 450 while (next <= end && !wrapped &&
@@ -498,6 +503,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
498 mem_cgroup_uncharge_end(); 503 mem_cgroup_uncharge_end();
499 cond_resched(); 504 cond_resched();
500 } 505 }
506 cleancache_flush_inode(mapping);
501 return ret; 507 return ret;
502} 508}
503EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); 509EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
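
The cleancache_flush_page()/cleancache_flush_inode() calls added to the truncate and invalidate paths keep transcendent memory coherent: a page has to be dropped from cleancache before its page-cache copy disappears, otherwise a later lookup could resurrect stale data. The self-contained toy below only mimics cleancache semantics with toy_* helpers keyed by a fake inode/index pair; it is meant to show why the flush must precede the truncation, not to model the real API.

    #include <stdio.h>

    /* Toy stand-in for cleancache: one slot per (inode, index) pair. */
    #define SLOTS 8

    struct entry { int inode, index, valid; const char *data; };
    static struct entry cache_slots[SLOTS];

    static int slot(int inode, int index) { return (inode * 31 + index) % SLOTS; }

    static void toy_put(int inode, int index, const char *data)
    {
        struct entry *e = &cache_slots[slot(inode, index)];
        e->inode = inode; e->index = index; e->valid = 1; e->data = data;
    }

    static const char *toy_get(int inode, int index)
    {
        struct entry *e = &cache_slots[slot(inode, index)];
        return (e->valid && e->inode == inode && e->index == index) ? e->data : NULL;
    }

    /* The step truncate_partial_page()/truncate_inode_pages_range() now add. */
    static void toy_flush_page(int inode, int index)
    {
        struct entry *e = &cache_slots[slot(inode, index)];
        if (e->valid && e->inode == inode && e->index == index)
            e->valid = 0;
    }

    int main(void)
    {
        const char *hit;

        toy_put(1, 0, "old contents");  /* clean page evicted into the cache */
        toy_flush_page(1, 0);           /* truncate: flush before freeing the page */
        hit = toy_get(1, 0);
        printf("after truncate: %s\n", hit ? hit : "miss (correct)");
        return 0;
    }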
diff --git a/mm/util.c b/mm/util.c
index e7b103a6fd21..88ea1bd661c0 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -6,6 +6,8 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <asm/uaccess.h> 7#include <asm/uaccess.h>
8 8
9#include "internal.h"
10
9#define CREATE_TRACE_POINTS 11#define CREATE_TRACE_POINTS
10#include <trace/events/kmem.h> 12#include <trace/events/kmem.h>
11 13
@@ -215,6 +217,28 @@ char *strndup_user(const char __user *s, long n)
215} 217}
216EXPORT_SYMBOL(strndup_user); 218EXPORT_SYMBOL(strndup_user);
217 219
220void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
221 struct vm_area_struct *prev, struct rb_node *rb_parent)
222{
223 struct vm_area_struct *next;
224
225 vma->vm_prev = prev;
226 if (prev) {
227 next = prev->vm_next;
228 prev->vm_next = vma;
229 } else {
230 mm->mmap = vma;
231 if (rb_parent)
232 next = rb_entry(rb_parent,
233 struct vm_area_struct, vm_rb);
234 else
235 next = NULL;
236 }
237 vma->vm_next = next;
238 if (next)
239 next->vm_prev = vma;
240}
241
218#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) 242#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
219void arch_pick_mmap_layout(struct mm_struct *mm) 243void arch_pick_mmap_layout(struct mm_struct *mm)
220{ 244{
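
__vma_link_list() above is plain doubly linked list maintenance: hook the new VMA in after prev (or at mm->mmap when prev is NULL), find the successor either from prev or from the rbtree parent, and fix vm_prev on that successor. Here is a self-contained version of the same insertion, minus the rbtree fallback, with made-up field names:

    #include <stdio.h>

    struct vma {
        unsigned long start;
        struct vma *next, *prev;        /* stand-ins for vm_next / vm_prev */
    };

    /* Insert 'vma' after 'prev' (or at the list head when prev is NULL). */
    static void vma_link_list(struct vma **head, struct vma *vma, struct vma *prev)
    {
        struct vma *next;

        vma->prev = prev;
        if (prev) {
            next = prev->next;
            prev->next = vma;
        } else {
            next = *head;
            *head = vma;
        }
        vma->next = next;
        if (next)
            next->prev = vma;
    }

    int main(void)
    {
        struct vma a = { 0x1000 }, b = { 0x3000 }, c = { 0x2000 };
        struct vma *head = NULL, *p;

        vma_link_list(&head, &a, NULL); /* [a] */
        vma_link_list(&head, &b, &a);   /* [a, b] */
        vma_link_list(&head, &c, &a);   /* [a, c, b] */
        for (p = head; p; p = p->next)
            printf("vma at 0x%lx\n", p->start);
        return 0;
    }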
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fdf4b1e88e53..1d34d75366a7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -375,7 +375,7 @@ nocache:
375 /* find starting point for our search */ 375 /* find starting point for our search */
376 if (free_vmap_cache) { 376 if (free_vmap_cache) {
377 first = rb_entry(free_vmap_cache, struct vmap_area, rb_node); 377 first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
378 addr = ALIGN(first->va_end + PAGE_SIZE, align); 378 addr = ALIGN(first->va_end, align);
379 if (addr < vstart) 379 if (addr < vstart)
380 goto nocache; 380 goto nocache;
381 if (addr + size - 1 < addr) 381 if (addr + size - 1 < addr)
@@ -406,10 +406,10 @@ nocache:
406 } 406 }
407 407
408 /* from the starting point, walk areas until a suitable hole is found */ 408 /* from the starting point, walk areas until a suitable hole is found */
409 while (addr + size >= first->va_start && addr + size <= vend) { 409 while (addr + size > first->va_start && addr + size <= vend) {
410 if (addr + cached_hole_size < first->va_start) 410 if (addr + cached_hole_size < first->va_start)
411 cached_hole_size = first->va_start - addr; 411 cached_hole_size = first->va_start - addr;
412 addr = ALIGN(first->va_end + PAGE_SIZE, align); 412 addr = ALIGN(first->va_end, align);
413 if (addr + size - 1 < addr) 413 if (addr + size - 1 < addr)
414 goto overflow; 414 goto overflow;
415 415
@@ -1534,6 +1534,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1534static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1534static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1535 pgprot_t prot, int node, void *caller) 1535 pgprot_t prot, int node, void *caller)
1536{ 1536{
1537 const int order = 0;
1537 struct page **pages; 1538 struct page **pages;
1538 unsigned int nr_pages, array_size, i; 1539 unsigned int nr_pages, array_size, i;
1539 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 1540 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
@@ -1560,11 +1561,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1560 1561
1561 for (i = 0; i < area->nr_pages; i++) { 1562 for (i = 0; i < area->nr_pages; i++) {
1562 struct page *page; 1563 struct page *page;
1564 gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
1563 1565
1564 if (node < 0) 1566 if (node < 0)
1565 page = alloc_page(gfp_mask); 1567 page = alloc_page(tmp_mask);
1566 else 1568 else
1567 page = alloc_pages_node(node, gfp_mask, 0); 1569 page = alloc_pages_node(node, tmp_mask, order);
1568 1570
1569 if (unlikely(!page)) { 1571 if (unlikely(!page)) {
1570 /* Successfully allocated i pages, free them in __vunmap() */ 1572 /* Successfully allocated i pages, free them in __vunmap() */
@@ -1579,6 +1581,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1579 return area->addr; 1581 return area->addr;
1580 1582
1581fail: 1583fail:
1584 warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, "
1585 "allocated %ld of %ld bytes\n",
1586 (area->nr_pages*PAGE_SIZE), area->size);
1582 vfree(area->addr); 1587 vfree(area->addr);
1583 return NULL; 1588 return NULL;
1584} 1589}
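
The alloc_vmap_area() change stops adding an extra PAGE_SIZE before aligning the next candidate address; as far as I can tell each vmalloc area already accounts for its own guard page, so the extra page only widened the gap and could make an exactly fitting hole look too small (hence the >= becoming >). The failure path also gains a warn_alloc_failed() message. A quick arithmetic check of the alignment change, with made-up addresses:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
        /* Previous vmap area ends here; its size already covers its guard page. */
        unsigned long va_end = 0x12000UL;
        unsigned long align  = PAGE_SIZE;

        unsigned long old_start = ALIGN(va_end + PAGE_SIZE, align);  /* old code  */
        unsigned long new_start = ALIGN(va_end, align);              /* this diff */

        printf("old candidate start: %#lx\n", old_start);   /* 0x13000 */
        printf("new candidate start: %#lx\n", new_start);   /* 0x12000 */
        printf("pages saved per area: %lu\n", (old_start - new_start) / PAGE_SIZE);
        return 0;
    }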
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8bfd45050a61..faa0a088f9cc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -42,6 +42,7 @@
42#include <linux/delayacct.h> 42#include <linux/delayacct.h>
43#include <linux/sysctl.h> 43#include <linux/sysctl.h>
44#include <linux/oom.h> 44#include <linux/oom.h>
45#include <linux/prefetch.h>
45 46
46#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
47#include <asm/div64.h> 48#include <asm/div64.h>
@@ -172,7 +173,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
172 struct scan_control *sc, enum lru_list lru) 173 struct scan_control *sc, enum lru_list lru)
173{ 174{
174 if (!scanning_global_lru(sc)) 175 if (!scanning_global_lru(sc))
175 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); 176 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru);
176 177
177 return zone_page_state(zone, NR_LRU_BASE + lru); 178 return zone_page_state(zone, NR_LRU_BASE + lru);
178} 179}
@@ -201,6 +202,14 @@ void unregister_shrinker(struct shrinker *shrinker)
201} 202}
202EXPORT_SYMBOL(unregister_shrinker); 203EXPORT_SYMBOL(unregister_shrinker);
203 204
205static inline int do_shrinker_shrink(struct shrinker *shrinker,
206 struct shrink_control *sc,
207 unsigned long nr_to_scan)
208{
209 sc->nr_to_scan = nr_to_scan;
210 return (*shrinker->shrink)(shrinker, sc);
211}
212
204#define SHRINK_BATCH 128 213#define SHRINK_BATCH 128
205/* 214/*
206 * Call the shrink functions to age shrinkable caches 215 * Call the shrink functions to age shrinkable caches
@@ -221,25 +230,29 @@ EXPORT_SYMBOL(unregister_shrinker);
221 * 230 *
222 * Returns the number of slab objects which we shrunk. 231 * Returns the number of slab objects which we shrunk.
223 */ 232 */
224unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, 233unsigned long shrink_slab(struct shrink_control *shrink,
225 unsigned long lru_pages) 234 unsigned long nr_pages_scanned,
235 unsigned long lru_pages)
226{ 236{
227 struct shrinker *shrinker; 237 struct shrinker *shrinker;
228 unsigned long ret = 0; 238 unsigned long ret = 0;
229 239
230 if (scanned == 0) 240 if (nr_pages_scanned == 0)
231 scanned = SWAP_CLUSTER_MAX; 241 nr_pages_scanned = SWAP_CLUSTER_MAX;
232 242
233 if (!down_read_trylock(&shrinker_rwsem)) 243 if (!down_read_trylock(&shrinker_rwsem)) {
234 return 1; /* Assume we'll be able to shrink next time */ 244 /* Assume we'll be able to shrink next time */
245 ret = 1;
246 goto out;
247 }
235 248
236 list_for_each_entry(shrinker, &shrinker_list, list) { 249 list_for_each_entry(shrinker, &shrinker_list, list) {
237 unsigned long long delta; 250 unsigned long long delta;
238 unsigned long total_scan; 251 unsigned long total_scan;
239 unsigned long max_pass; 252 unsigned long max_pass;
240 253
241 max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); 254 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
242 delta = (4 * scanned) / shrinker->seeks; 255 delta = (4 * nr_pages_scanned) / shrinker->seeks;
243 delta *= max_pass; 256 delta *= max_pass;
244 do_div(delta, lru_pages + 1); 257 do_div(delta, lru_pages + 1);
245 shrinker->nr += delta; 258 shrinker->nr += delta;
@@ -266,9 +279,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
266 int shrink_ret; 279 int shrink_ret;
267 int nr_before; 280 int nr_before;
268 281
269 nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); 282 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
270 shrink_ret = (*shrinker->shrink)(shrinker, this_scan, 283 shrink_ret = do_shrinker_shrink(shrinker, shrink,
271 gfp_mask); 284 this_scan);
272 if (shrink_ret == -1) 285 if (shrink_ret == -1)
273 break; 286 break;
274 if (shrink_ret < nr_before) 287 if (shrink_ret < nr_before)
@@ -282,6 +295,8 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
282 shrinker->nr += total_scan; 295 shrinker->nr += total_scan;
283 } 296 }
284 up_read(&shrinker_rwsem); 297 up_read(&shrinker_rwsem);
298out:
299 cond_resched();
285 return ret; 300 return ret;
286} 301}
287 302
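
With the hunk above, shrinkers are invoked through do_shrinker_shrink(), which packs nr_to_scan into a struct shrink_control instead of passing it positionally alongside the gfp mask. A shrinker written against this API then looks roughly like the sketch below; my_count_objects() and my_free_objects() are hypothetical placeholders for a real cache's counting and reclaim routines.

    /* Sketch of a shrinker against the shrink_control API introduced above. */
    static int my_shrink(struct shrinker *sh, struct shrink_control *sc)
    {
        if (sc->nr_to_scan)                     /* 0 means "just report the count" */
            my_free_objects(sc->nr_to_scan);    /* honour sc->gfp_mask if sleeping */
        return my_count_objects();              /* remaining cache size, or -1 to abort */
    }

    static struct shrinker my_shrinker = {
        .shrink = my_shrink,
        .seeks  = DEFAULT_SEEKS,
    };
    /* register_shrinker(&my_shrinker) at init, unregister_shrinker() on teardown. */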
@@ -1201,13 +1216,16 @@ int isolate_lru_page(struct page *page)
1201{ 1216{
1202 int ret = -EBUSY; 1217 int ret = -EBUSY;
1203 1218
1219 VM_BUG_ON(!page_count(page));
1220
1204 if (PageLRU(page)) { 1221 if (PageLRU(page)) {
1205 struct zone *zone = page_zone(page); 1222 struct zone *zone = page_zone(page);
1206 1223
1207 spin_lock_irq(&zone->lru_lock); 1224 spin_lock_irq(&zone->lru_lock);
1208 if (PageLRU(page) && get_page_unless_zero(page)) { 1225 if (PageLRU(page)) {
1209 int lru = page_lru(page); 1226 int lru = page_lru(page);
1210 ret = 0; 1227 ret = 0;
1228 get_page(page);
1211 ClearPageLRU(page); 1229 ClearPageLRU(page);
1212 1230
1213 del_page_from_lru_list(zone, page, lru); 1231 del_page_from_lru_list(zone, page, lru);
@@ -1700,26 +1718,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1700} 1718}
1701 1719
1702/* 1720/*
1703 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
1704 * until we collected @swap_cluster_max pages to scan.
1705 */
1706static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1707 unsigned long *nr_saved_scan)
1708{
1709 unsigned long nr;
1710
1711 *nr_saved_scan += nr_to_scan;
1712 nr = *nr_saved_scan;
1713
1714 if (nr >= SWAP_CLUSTER_MAX)
1715 *nr_saved_scan = 0;
1716 else
1717 nr = 0;
1718
1719 return nr;
1720}
1721
1722/*
1723 * Determine how aggressively the anon and file LRU lists should be 1721 * Determine how aggressively the anon and file LRU lists should be
1724 * scanned. The relative value of each set of LRU lists is determined 1722 * scanned. The relative value of each set of LRU lists is determined
1725 * by looking at the fraction of the pages scanned we did rotate back 1723 * by looking at the fraction of the pages scanned we did rotate back
@@ -1737,6 +1735,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1737 u64 fraction[2], denominator; 1735 u64 fraction[2], denominator;
1738 enum lru_list l; 1736 enum lru_list l;
1739 int noswap = 0; 1737 int noswap = 0;
1738 int force_scan = 0;
1739
1740
1741 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1742 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1743 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1744 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1745
1746 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
1747 /* kswapd does zone balancing and needs to scan this zone */
1748 if (scanning_global_lru(sc) && current_is_kswapd())
1749 force_scan = 1;
1750 /* memcg may have a small limit and needs to avoid a priority drop */
1751 if (!scanning_global_lru(sc))
1752 force_scan = 1;
1753 }
1740 1754
1741 /* If we have no swap space, do not bother scanning anon pages. */ 1755 /* If we have no swap space, do not bother scanning anon pages. */
1742 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1756 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1747,11 +1761,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1747 goto out; 1761 goto out;
1748 } 1762 }
1749 1763
1750 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1751 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1752 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1753 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1754
1755 if (scanning_global_lru(sc)) { 1764 if (scanning_global_lru(sc)) {
1756 free = zone_page_state(zone, NR_FREE_PAGES); 1765 free = zone_page_state(zone, NR_FREE_PAGES);
1757 /* If we have very few page cache pages, 1766 /* If we have very few page cache pages,
@@ -1818,8 +1827,23 @@ out:
1818 scan >>= priority; 1827 scan >>= priority;
1819 scan = div64_u64(scan * fraction[file], denominator); 1828 scan = div64_u64(scan * fraction[file], denominator);
1820 } 1829 }
1821 nr[l] = nr_scan_try_batch(scan, 1830
1822 &reclaim_stat->nr_saved_scan[l]); 1831 /*
1832 * If the zone or the memcg is small, nr[l] can end up 0.
1833 * That means no scanning at this priority and a priority drop.
1834 * For global direct reclaim this is harmless, since the next
1835 * zone can be visited. Global kswapd, however, does zone
1836 * balancing and needs to scan a small amount. With memcg, a
1837 * priority drop can cause large latency, so it is better to
1838 * scan a small amount. See force_scan above.
1839 */
1840 if (!scan && force_scan) {
1841 if (file)
1842 scan = SWAP_CLUSTER_MAX;
1843 else if (!noswap)
1844 scan = SWAP_CLUSTER_MAX;
1845 }
1846 nr[l] = scan;
1823 } 1847 }
1824} 1848}
1825 1849
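The get_scan_count() hunks above add a force_scan path so that tiny zones and small memcgs still make progress instead of repeatedly dropping priority. Distilled into a standalone predicate (illustrative only; the real logic stays inline in get_scan_count()):

	static bool want_force_scan(unsigned long anon, unsigned long file,
				    int priority, bool global_lru, bool is_kswapd)
	{
		if (((anon + file) >> priority) >= SWAP_CLUSTER_MAX)
			return false;	/* enough pages; normal scaling works */
		if (global_lru && is_kswapd)
			return true;	/* kswapd must still balance this zone */
		if (!global_lru)
			return true;	/* small memcg: avoid a priority drop */
		return false;
	}

When the predicate holds and the scaled scan target comes out as 0, nr[l] is bumped to SWAP_CLUSTER_MAX, except for the anon lists when swap is unavailable.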
@@ -1959,11 +1983,14 @@ restart:
1959 * If a zone is deemed to be full of pinned pages then just give it a light 1983 * If a zone is deemed to be full of pinned pages then just give it a light
1960 * scan then give up on it. 1984 * scan then give up on it.
1961 */ 1985 */
1962static void shrink_zones(int priority, struct zonelist *zonelist, 1986static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1963 struct scan_control *sc) 1987 struct scan_control *sc)
1964{ 1988{
1965 struct zoneref *z; 1989 struct zoneref *z;
1966 struct zone *zone; 1990 struct zone *zone;
1991 unsigned long nr_soft_reclaimed;
1992 unsigned long nr_soft_scanned;
1993 unsigned long total_scanned = 0;
1967 1994
1968 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1995 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1969 gfp_zone(sc->gfp_mask), sc->nodemask) { 1996 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1980,8 +2007,17 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1980 continue; /* Let kswapd poll it */ 2007 continue; /* Let kswapd poll it */
1981 } 2008 }
1982 2009
2010 nr_soft_scanned = 0;
2011 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2012 sc->order, sc->gfp_mask,
2013 &nr_soft_scanned);
2014 sc->nr_reclaimed += nr_soft_reclaimed;
2015 total_scanned += nr_soft_scanned;
2016
1983 shrink_zone(priority, zone, sc); 2017 shrink_zone(priority, zone, sc);
1984 } 2018 }
2019
2020 return total_scanned;
1985} 2021}
1986 2022
1987static bool zone_reclaimable(struct zone *zone) 2023static bool zone_reclaimable(struct zone *zone)
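shrink_zones() now performs memcg soft-limit reclaim for every eligible zone and reports how many pages that scanned, so the caller can fold the number into its writeback-throttling arithmetic. The out-parameter form of the memcg hook implied here is presumably declared in include/linux/memcontrol.h (reconstruction, not shown in this diff):

	unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
						    gfp_t gfp_mask,
						    unsigned long *total_scanned);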
@@ -2026,7 +2062,8 @@ static bool all_unreclaimable(struct zonelist *zonelist,
2026 * else, the number of pages reclaimed 2062 * else, the number of pages reclaimed
2027 */ 2063 */
2028static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 2064static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2029 struct scan_control *sc) 2065 struct scan_control *sc,
2066 struct shrink_control *shrink)
2030{ 2067{
2031 int priority; 2068 int priority;
2032 unsigned long total_scanned = 0; 2069 unsigned long total_scanned = 0;
@@ -2045,7 +2082,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2045 sc->nr_scanned = 0; 2082 sc->nr_scanned = 0;
2046 if (!priority) 2083 if (!priority)
2047 disable_swap_token(); 2084 disable_swap_token();
2048 shrink_zones(priority, zonelist, sc); 2085 total_scanned += shrink_zones(priority, zonelist, sc);
2049 /* 2086 /*
2050 * Don't shrink slabs when reclaiming memory from 2087 * Don't shrink slabs when reclaiming memory from
2051 * over limit cgroups 2088 * over limit cgroups
@@ -2060,7 +2097,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2060 lru_pages += zone_reclaimable_pages(zone); 2097 lru_pages += zone_reclaimable_pages(zone);
2061 } 2098 }
2062 2099
2063 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 2100 shrink_slab(shrink, sc->nr_scanned, lru_pages);
2064 if (reclaim_state) { 2101 if (reclaim_state) {
2065 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2102 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2066 reclaim_state->reclaimed_slab = 0; 2103 reclaim_state->reclaimed_slab = 0;
@@ -2132,12 +2169,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2132 .mem_cgroup = NULL, 2169 .mem_cgroup = NULL,
2133 .nodemask = nodemask, 2170 .nodemask = nodemask,
2134 }; 2171 };
2172 struct shrink_control shrink = {
2173 .gfp_mask = sc.gfp_mask,
2174 };
2135 2175
2136 trace_mm_vmscan_direct_reclaim_begin(order, 2176 trace_mm_vmscan_direct_reclaim_begin(order,
2137 sc.may_writepage, 2177 sc.may_writepage,
2138 gfp_mask); 2178 gfp_mask);
2139 2179
2140 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2180 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2141 2181
2142 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); 2182 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
2143 2183
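try_to_free_pages() now builds a struct shrink_control alongside the scan_control and hands it to do_try_to_free_pages(), which passes it on to shrink_slab(). Under the shrinker prototype assumed in the sketch earlier (shrink(struct shrinker *, struct shrink_control *)), a client shrinker might look roughly like this (hypothetical example, not part of this diff):

	static int dummy_cache_objects;		/* stand-in for a real cache */

	static int dummy_shrink(struct shrinker *s, struct shrink_control *sc)
	{
		if (!sc->nr_to_scan)
			return dummy_cache_objects;	/* query pass */
		/* free up to sc->nr_to_scan objects, honouring sc->gfp_mask */
		dummy_cache_objects -= min_t(int, dummy_cache_objects,
					     sc->nr_to_scan);
		return dummy_cache_objects;
	}

	static struct shrinker dummy_shrinker = {
		.shrink = dummy_shrink,
		.seeks  = DEFAULT_SEEKS,
	};
	/* register_shrinker(&dummy_shrinker) at init,
	 * unregister_shrinker(&dummy_shrinker) at teardown */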
@@ -2149,9 +2189,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2149unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2189unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2150 gfp_t gfp_mask, bool noswap, 2190 gfp_t gfp_mask, bool noswap,
2151 unsigned int swappiness, 2191 unsigned int swappiness,
2152 struct zone *zone) 2192 struct zone *zone,
2193 unsigned long *nr_scanned)
2153{ 2194{
2154 struct scan_control sc = { 2195 struct scan_control sc = {
2196 .nr_scanned = 0,
2155 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2197 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2156 .may_writepage = !laptop_mode, 2198 .may_writepage = !laptop_mode,
2157 .may_unmap = 1, 2199 .may_unmap = 1,
@@ -2160,6 +2202,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2160 .order = 0, 2202 .order = 0,
2161 .mem_cgroup = mem, 2203 .mem_cgroup = mem,
2162 }; 2204 };
2205
2163 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2206 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2164 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2207 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2165 2208
@@ -2178,6 +2221,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2178 2221
2179 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2222 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2180 2223
2224 *nr_scanned = sc.nr_scanned;
2181 return sc.nr_reclaimed; 2225 return sc.nr_reclaimed;
2182} 2226}
2183 2227
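mem_cgroup_shrink_node_zone() gains an nr_scanned out-parameter so the memcg soft-limit loop can account scanned pages as well as reclaimed ones. A hypothetical wrapper showing how a caller in mm/memcontrol.c (changed elsewhere in this diff) could propagate the count:

	static unsigned long soft_reclaim_one_zone(struct mem_cgroup *victim,
						   struct zone *zone,
						   gfp_t gfp_mask, bool noswap,
						   unsigned long *total_scanned)
	{
		unsigned long scanned = 0;
		unsigned long reclaimed;

		reclaimed = mem_cgroup_shrink_node_zone(victim, gfp_mask, noswap,
							vm_swappiness, zone,
							&scanned);
		*total_scanned += scanned;
		return reclaimed;
	}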
@@ -2188,6 +2232,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2188{ 2232{
2189 struct zonelist *zonelist; 2233 struct zonelist *zonelist;
2190 unsigned long nr_reclaimed; 2234 unsigned long nr_reclaimed;
2235 int nid;
2191 struct scan_control sc = { 2236 struct scan_control sc = {
2192 .may_writepage = !laptop_mode, 2237 .may_writepage = !laptop_mode,
2193 .may_unmap = 1, 2238 .may_unmap = 1,
@@ -2197,17 +2242,27 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2197 .order = 0, 2242 .order = 0,
2198 .mem_cgroup = mem_cont, 2243 .mem_cgroup = mem_cont,
2199 .nodemask = NULL, /* we don't care the placement */ 2244 .nodemask = NULL, /* we don't care the placement */
2245 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2246 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2247 };
2248 struct shrink_control shrink = {
2249 .gfp_mask = sc.gfp_mask,
2200 }; 2250 };
2201 2251
2202 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2252 /*
2203 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2253 * Unlike direct reclaim via alloc_pages(), memcg reclaim does not
2204 zonelist = NODE_DATA(numa_node_id())->node_zonelists; 2254 * care which node the pages come from, so the node where the scan
2255 * starts does not need to be the current node.
2256 */
2257 nid = mem_cgroup_select_victim_node(mem_cont);
2258
2259 zonelist = NODE_DATA(nid)->node_zonelists;
2205 2260
2206 trace_mm_vmscan_memcg_reclaim_begin(0, 2261 trace_mm_vmscan_memcg_reclaim_begin(0,
2207 sc.may_writepage, 2262 sc.may_writepage,
2208 sc.gfp_mask); 2263 sc.gfp_mask);
2209 2264
2210 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2265 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2211 2266
2212 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 2267 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2213 2268
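try_to_free_mem_cgroup_pages() no longer anchors the scan on the current node; mem_cgroup_select_victim_node() (implemented in mm/memcontrol.c, changed elsewhere in this diff) picks the starting node instead. The idea is a simple round-robin over nodes with memory, along these lines (hypothetical sketch, not the actual implementation):

	static int pick_victim_node(int last_scanned_node)
	{
		int nid = next_node(last_scanned_node, node_states[N_HIGH_MEMORY]);

		if (nid == MAX_NUMNODES)		/* wrap around */
			nid = first_node(node_states[N_HIGH_MEMORY]);
		return nid;
	}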
@@ -2286,7 +2341,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2286 * must be balanced 2341 * must be balanced
2287 */ 2342 */
2288 if (order) 2343 if (order)
2289 return pgdat_balanced(pgdat, balanced, classzone_idx); 2344 return !pgdat_balanced(pgdat, balanced, classzone_idx);
2290 else 2345 else
2291 return !all_zones_ok; 2346 return !all_zones_ok;
2292} 2347}
@@ -2322,6 +2377,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2322 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2377 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2323 unsigned long total_scanned; 2378 unsigned long total_scanned;
2324 struct reclaim_state *reclaim_state = current->reclaim_state; 2379 struct reclaim_state *reclaim_state = current->reclaim_state;
2380 unsigned long nr_soft_reclaimed;
2381 unsigned long nr_soft_scanned;
2325 struct scan_control sc = { 2382 struct scan_control sc = {
2326 .gfp_mask = GFP_KERNEL, 2383 .gfp_mask = GFP_KERNEL,
2327 .may_unmap = 1, 2384 .may_unmap = 1,
@@ -2335,6 +2392,9 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2335 .order = order, 2392 .order = order,
2336 .mem_cgroup = NULL, 2393 .mem_cgroup = NULL,
2337 }; 2394 };
2395 struct shrink_control shrink = {
2396 .gfp_mask = sc.gfp_mask,
2397 };
2338loop_again: 2398loop_again:
2339 total_scanned = 0; 2399 total_scanned = 0;
2340 sc.nr_reclaimed = 0; 2400 sc.nr_reclaimed = 0;
@@ -2411,11 +2471,15 @@ loop_again:
2411 2471
2412 sc.nr_scanned = 0; 2472 sc.nr_scanned = 0;
2413 2473
2474 nr_soft_scanned = 0;
2414 /* 2475 /*
2415 * Call soft limit reclaim before calling shrink_zone. 2476 * Call soft limit reclaim before calling shrink_zone.
2416 * For now we ignore the return value
2417 */ 2477 */
2418 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); 2478 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2479 order, sc.gfp_mask,
2480 &nr_soft_scanned);
2481 sc.nr_reclaimed += nr_soft_reclaimed;
2482 total_scanned += nr_soft_scanned;
2419 2483
2420 /* 2484 /*
2421 * We put equal pressure on every zone, unless 2485 * We put equal pressure on every zone, unless
@@ -2434,8 +2498,7 @@ loop_again:
2434 end_zone, 0)) 2498 end_zone, 0))
2435 shrink_zone(priority, zone, &sc); 2499 shrink_zone(priority, zone, &sc);
2436 reclaim_state->reclaimed_slab = 0; 2500 reclaim_state->reclaimed_slab = 0;
2437 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 2501 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2438 lru_pages);
2439 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2502 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2440 total_scanned += sc.nr_scanned; 2503 total_scanned += sc.nr_scanned;
2441 2504
@@ -2787,7 +2850,10 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2787 .swappiness = vm_swappiness, 2850 .swappiness = vm_swappiness,
2788 .order = 0, 2851 .order = 0,
2789 }; 2852 };
2790 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 2853 struct shrink_control shrink = {
2854 .gfp_mask = sc.gfp_mask,
2855 };
2856 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
2791 struct task_struct *p = current; 2857 struct task_struct *p = current;
2792 unsigned long nr_reclaimed; 2858 unsigned long nr_reclaimed;
2793 2859
@@ -2796,7 +2862,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2796 reclaim_state.reclaimed_slab = 0; 2862 reclaim_state.reclaimed_slab = 0;
2797 p->reclaim_state = &reclaim_state; 2863 p->reclaim_state = &reclaim_state;
2798 2864
2799 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2865 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2800 2866
2801 p->reclaim_state = NULL; 2867 p->reclaim_state = NULL;
2802 lockdep_clear_current_reclaim_state(); 2868 lockdep_clear_current_reclaim_state();
@@ -2971,6 +3037,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2971 .swappiness = vm_swappiness, 3037 .swappiness = vm_swappiness,
2972 .order = order, 3038 .order = order,
2973 }; 3039 };
3040 struct shrink_control shrink = {
3041 .gfp_mask = sc.gfp_mask,
3042 };
2974 unsigned long nr_slab_pages0, nr_slab_pages1; 3043 unsigned long nr_slab_pages0, nr_slab_pages1;
2975 3044
2976 cond_resched(); 3045 cond_resched();
@@ -3012,7 +3081,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3012 unsigned long lru_pages = zone_reclaimable_pages(zone); 3081 unsigned long lru_pages = zone_reclaimable_pages(zone);
3013 3082
3014 /* No reclaimable slab or very low memory pressure */ 3083 /* No reclaimable slab or very low memory pressure */
3015 if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages)) 3084 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
3016 break; 3085 break;
3017 3086
3018 /* Freed enough memory */ 3087 /* Freed enough memory */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 897ea9e88238..20c18b7694b2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -157,7 +157,7 @@ int calculate_normal_threshold(struct zone *zone)
157/* 157/*
158 * Refresh the thresholds for each zone. 158 * Refresh the thresholds for each zone.
159 */ 159 */
160static void refresh_zone_stat_thresholds(void) 160void refresh_zone_stat_thresholds(void)
161{ 161{
162 struct zone *zone; 162 struct zone *zone;
163 int cpu; 163 int cpu;
@@ -659,6 +659,138 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
659} 659}
660#endif 660#endif
661 661
662#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS)
663#ifdef CONFIG_ZONE_DMA
664#define TEXT_FOR_DMA(xx) xx "_dma",
665#else
666#define TEXT_FOR_DMA(xx)
667#endif
668
669#ifdef CONFIG_ZONE_DMA32
670#define TEXT_FOR_DMA32(xx) xx "_dma32",
671#else
672#define TEXT_FOR_DMA32(xx)
673#endif
674
675#ifdef CONFIG_HIGHMEM
676#define TEXT_FOR_HIGHMEM(xx) xx "_high",
677#else
678#define TEXT_FOR_HIGHMEM(xx)
679#endif
680
681#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
682 TEXT_FOR_HIGHMEM(xx) xx "_movable",
683
684const char * const vmstat_text[] = {
685 /* Zoned VM counters */
686 "nr_free_pages",
687 "nr_inactive_anon",
688 "nr_active_anon",
689 "nr_inactive_file",
690 "nr_active_file",
691 "nr_unevictable",
692 "nr_mlock",
693 "nr_anon_pages",
694 "nr_mapped",
695 "nr_file_pages",
696 "nr_dirty",
697 "nr_writeback",
698 "nr_slab_reclaimable",
699 "nr_slab_unreclaimable",
700 "nr_page_table_pages",
701 "nr_kernel_stack",
702 "nr_unstable",
703 "nr_bounce",
704 "nr_vmscan_write",
705 "nr_writeback_temp",
706 "nr_isolated_anon",
707 "nr_isolated_file",
708 "nr_shmem",
709 "nr_dirtied",
710 "nr_written",
711
712#ifdef CONFIG_NUMA
713 "numa_hit",
714 "numa_miss",
715 "numa_foreign",
716 "numa_interleave",
717 "numa_local",
718 "numa_other",
719#endif
720 "nr_anon_transparent_hugepages",
721 "nr_dirty_threshold",
722 "nr_dirty_background_threshold",
723
724#ifdef CONFIG_VM_EVENT_COUNTERS
725 "pgpgin",
726 "pgpgout",
727 "pswpin",
728 "pswpout",
729
730 TEXTS_FOR_ZONES("pgalloc")
731
732 "pgfree",
733 "pgactivate",
734 "pgdeactivate",
735
736 "pgfault",
737 "pgmajfault",
738
739 TEXTS_FOR_ZONES("pgrefill")
740 TEXTS_FOR_ZONES("pgsteal")
741 TEXTS_FOR_ZONES("pgscan_kswapd")
742 TEXTS_FOR_ZONES("pgscan_direct")
743
744#ifdef CONFIG_NUMA
745 "zone_reclaim_failed",
746#endif
747 "pginodesteal",
748 "slabs_scanned",
749 "kswapd_steal",
750 "kswapd_inodesteal",
751 "kswapd_low_wmark_hit_quickly",
752 "kswapd_high_wmark_hit_quickly",
753 "kswapd_skip_congestion_wait",
754 "pageoutrun",
755 "allocstall",
756
757 "pgrotated",
758
759#ifdef CONFIG_COMPACTION
760 "compact_blocks_moved",
761 "compact_pages_moved",
762 "compact_pagemigrate_failed",
763 "compact_stall",
764 "compact_fail",
765 "compact_success",
766#endif
767
768#ifdef CONFIG_HUGETLB_PAGE
769 "htlb_buddy_alloc_success",
770 "htlb_buddy_alloc_fail",
771#endif
772 "unevictable_pgs_culled",
773 "unevictable_pgs_scanned",
774 "unevictable_pgs_rescued",
775 "unevictable_pgs_mlocked",
776 "unevictable_pgs_munlocked",
777 "unevictable_pgs_cleared",
778 "unevictable_pgs_stranded",
779 "unevictable_pgs_mlockfreed",
780
781#ifdef CONFIG_TRANSPARENT_HUGEPAGE
782 "thp_fault_alloc",
783 "thp_fault_fallback",
784 "thp_collapse_alloc",
785 "thp_collapse_alloc_failed",
786 "thp_split",
787#endif
788
789#endif /* CONFIG_VM_EVENT_COUNTERS */
790};
791#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */
792
793
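The counter-name table vmstat_text[] moves ahead of the CONFIG_PROC_FS-only code and loses its static qualifier, so it is now built whenever either /proc or sysfs support is configured and can be referenced from other files. The companion declaration presumably lives in include/linux/vmstat.h (assumed, not shown in this diff):

	#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS)
	extern const char * const vmstat_text[];
	#endif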
662#ifdef CONFIG_PROC_FS 794#ifdef CONFIG_PROC_FS
663static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, 795static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
664 struct zone *zone) 796 struct zone *zone)
@@ -831,135 +963,6 @@ static const struct file_operations pagetypeinfo_file_ops = {
831 .release = seq_release, 963 .release = seq_release,
832}; 964};
833 965
834#ifdef CONFIG_ZONE_DMA
835#define TEXT_FOR_DMA(xx) xx "_dma",
836#else
837#define TEXT_FOR_DMA(xx)
838#endif
839
840#ifdef CONFIG_ZONE_DMA32
841#define TEXT_FOR_DMA32(xx) xx "_dma32",
842#else
843#define TEXT_FOR_DMA32(xx)
844#endif
845
846#ifdef CONFIG_HIGHMEM
847#define TEXT_FOR_HIGHMEM(xx) xx "_high",
848#else
849#define TEXT_FOR_HIGHMEM(xx)
850#endif
851
852#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
853 TEXT_FOR_HIGHMEM(xx) xx "_movable",
854
855static const char * const vmstat_text[] = {
856 /* Zoned VM counters */
857 "nr_free_pages",
858 "nr_inactive_anon",
859 "nr_active_anon",
860 "nr_inactive_file",
861 "nr_active_file",
862 "nr_unevictable",
863 "nr_mlock",
864 "nr_anon_pages",
865 "nr_mapped",
866 "nr_file_pages",
867 "nr_dirty",
868 "nr_writeback",
869 "nr_slab_reclaimable",
870 "nr_slab_unreclaimable",
871 "nr_page_table_pages",
872 "nr_kernel_stack",
873 "nr_unstable",
874 "nr_bounce",
875 "nr_vmscan_write",
876 "nr_writeback_temp",
877 "nr_isolated_anon",
878 "nr_isolated_file",
879 "nr_shmem",
880 "nr_dirtied",
881 "nr_written",
882
883#ifdef CONFIG_NUMA
884 "numa_hit",
885 "numa_miss",
886 "numa_foreign",
887 "numa_interleave",
888 "numa_local",
889 "numa_other",
890#endif
891 "nr_anon_transparent_hugepages",
892 "nr_dirty_threshold",
893 "nr_dirty_background_threshold",
894
895#ifdef CONFIG_VM_EVENT_COUNTERS
896 "pgpgin",
897 "pgpgout",
898 "pswpin",
899 "pswpout",
900
901 TEXTS_FOR_ZONES("pgalloc")
902
903 "pgfree",
904 "pgactivate",
905 "pgdeactivate",
906
907 "pgfault",
908 "pgmajfault",
909
910 TEXTS_FOR_ZONES("pgrefill")
911 TEXTS_FOR_ZONES("pgsteal")
912 TEXTS_FOR_ZONES("pgscan_kswapd")
913 TEXTS_FOR_ZONES("pgscan_direct")
914
915#ifdef CONFIG_NUMA
916 "zone_reclaim_failed",
917#endif
918 "pginodesteal",
919 "slabs_scanned",
920 "kswapd_steal",
921 "kswapd_inodesteal",
922 "kswapd_low_wmark_hit_quickly",
923 "kswapd_high_wmark_hit_quickly",
924 "kswapd_skip_congestion_wait",
925 "pageoutrun",
926 "allocstall",
927
928 "pgrotated",
929
930#ifdef CONFIG_COMPACTION
931 "compact_blocks_moved",
932 "compact_pages_moved",
933 "compact_pagemigrate_failed",
934 "compact_stall",
935 "compact_fail",
936 "compact_success",
937#endif
938
939#ifdef CONFIG_HUGETLB_PAGE
940 "htlb_buddy_alloc_success",
941 "htlb_buddy_alloc_fail",
942#endif
943 "unevictable_pgs_culled",
944 "unevictable_pgs_scanned",
945 "unevictable_pgs_rescued",
946 "unevictable_pgs_mlocked",
947 "unevictable_pgs_munlocked",
948 "unevictable_pgs_cleared",
949 "unevictable_pgs_stranded",
950 "unevictable_pgs_mlockfreed",
951
952#ifdef CONFIG_TRANSPARENT_HUGEPAGE
953 "thp_fault_alloc",
954 "thp_fault_fallback",
955 "thp_collapse_alloc",
956 "thp_collapse_alloc_failed",
957 "thp_split",
958#endif
959
960#endif /* CONFIG_VM_EVENTS_COUNTERS */
961};
962
963static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, 966static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
964 struct zone *zone) 967 struct zone *zone)
965{ 968{
@@ -1198,7 +1201,6 @@ static int __init setup_vmstat(void)
1198#ifdef CONFIG_SMP 1201#ifdef CONFIG_SMP
1199 int cpu; 1202 int cpu;
1200 1203
1201 refresh_zone_stat_thresholds();
1202 register_cpu_notifier(&vmstat_notifier); 1204 register_cpu_notifier(&vmstat_notifier);
1203 1205
1204 for_each_online_cpu(cpu) 1206 for_each_online_cpu(cpu)
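refresh_zone_stat_thresholds() is likewise made non-static in the earlier vmstat.c hunk, and its explicit call is dropped from setup_vmstat() here, the expectation being that it is now invoked from zone-initialisation code outside vmstat.c (mm/page_alloc.c is touched elsewhere in this diff). The matching declaration would be something like (assumed, not shown here):

	extern void refresh_zone_stat_thresholds(void);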