Diffstat (limited to 'mm')
-rw-r--r--	mm/Kconfig	13
-rw-r--r--	mm/backing-dev.c	27
-rw-r--r--	mm/highmem.c	17
-rw-r--r--	mm/kmemleak.c	5
-rw-r--r--	mm/ksm.c	11
-rw-r--r--	mm/memcontrol.c	127
-rw-r--r--	mm/memory-failure.c	59
-rw-r--r--	mm/memory.c	14
-rw-r--r--	mm/memory_hotplug.c	24
-rw-r--r--	mm/mempolicy.c	13
-rw-r--r--	mm/migrate.c	2
-rw-r--r--	mm/nommu.c	6
-rw-r--r--	mm/page-writeback.c	3
-rw-r--r--	mm/page_alloc.c	7
-rw-r--r--	mm/percpu.c	217
-rw-r--r--	mm/rmap.c	4
-rw-r--r--	mm/swapfile.c	15
-rw-r--r--	mm/vmalloc.c	50
-rw-r--r--	mm/vmscan.c	14
19 files changed, 390 insertions, 238 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index edd300aca173..44cf6f0a3a6d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -67,7 +67,7 @@ config DISCONTIGMEM
 
 config SPARSEMEM
 	def_bool y
-	depends on SPARSEMEM_MANUAL
+	depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
 
 config FLATMEM
 	def_bool y
@@ -128,11 +128,8 @@ config SPARSEMEM_VMEMMAP
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
 	depends on SPARSEMEM || X86_64_ACPI_NUMA
-	depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG
-	depends on (IA64 || X86 || PPC64 || SUPERH || S390)
-
-comment "Memory hotplug is currently incompatible with Software Suspend"
-	depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390
+	depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
+	depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
 
 config MEMORY_HOTPLUG_SPARSE
 	def_bool y
@@ -224,7 +221,9 @@ config KSM
 	  the many instances by a single resident page with that content, so
 	  saving memory until one or another app needs to modify the content.
 	  Recommended for use with KVM, or with other duplicative applications.
-	  See Documentation/vm/ksm.txt for more information.
+	  See Documentation/vm/ksm.txt for more information: KSM is inactive
+	  until a program has madvised that an area is MADV_MERGEABLE, and
+	  root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
 
 config DEFAULT_MMAP_MIN_ADDR
 	int "Low address space to protect from user allocation"
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3d3accb1f800..67a33a5a1a93 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -92,7 +92,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "BdiDirtyThresh:   %8lu kB\n"
 		   "DirtyThresh:      %8lu kB\n"
 		   "BackgroundThresh: %8lu kB\n"
-		   "WriteBack threads:%8lu\n"
+		   "WritebackThreads: %8lu\n"
 		   "b_dirty:          %8lu\n"
 		   "b_io:             %8lu\n"
 		   "b_more_io:        %8lu\n"
@@ -604,15 +604,36 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 
 	/*
 	 * Finally, kill the kernel threads. We don't need to be RCU
-	 * safe anymore, since the bdi is gone from visibility.
+	 * safe anymore, since the bdi is gone from visibility. Force
+	 * unfreeze of the thread before calling kthread_stop(), otherwise
+	 * it would never exit if it is currently stuck in the refrigerator.
 	 */
-	list_for_each_entry(wb, &bdi->wb_list, list)
+	list_for_each_entry(wb, &bdi->wb_list, list) {
+		wb->task->flags &= ~PF_FROZEN;
 		kthread_stop(wb->task);
+	}
+}
+
+/*
+ * This bdi is going away now, make sure that no super_blocks point to it
+ */
+static void bdi_prune_sb(struct backing_dev_info *bdi)
+{
+	struct super_block *sb;
+
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (sb->s_bdi == bdi)
+			sb->s_bdi = NULL;
+	}
+	spin_unlock(&sb_lock);
 }
 
 void bdi_unregister(struct backing_dev_info *bdi)
 {
 	if (bdi->dev) {
+		bdi_prune_sb(bdi);
+
 		if (!bdi_cap_flush_forker(bdi))
 			bdi_wb_shutdown(bdi);
 		bdi_debug_unregister(bdi);
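The bdi_wb_shutdown() hunk encodes a general freezer rule: a kthread parked in the refrigerator never re-checks kthread_should_stop(), so whoever stops it must unfreeze it first. A minimal sketch of the pattern under that assumption (the helper name is hypothetical; the flag manipulation mirrors the hunk above):

	#include <linux/kthread.h>
	#include <linux/sched.h>

	/* illustrative: stop a freezable kthread that may be frozen */
	static void stop_possibly_frozen_kthread(struct task_struct *task)
	{
		/*
		 * Clear PF_FROZEN so the thread can leave the refrigerator
		 * and observe kthread_should_stop(); otherwise
		 * kthread_stop() would block forever.
		 */
		task->flags &= ~PF_FROZEN;
		kthread_stop(task);
	}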
diff --git a/mm/highmem.c b/mm/highmem.c
index 25878cc49daa..9c1e627f282e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -426,16 +426,21 @@ void __init page_address_init(void)
 
 void debug_kmap_atomic(enum km_type type)
 {
-	static unsigned warn_count = 10;
+	static int warn_count = 10;
 
-	if (unlikely(warn_count == 0))
+	if (unlikely(warn_count < 0))
 		return;
 
 	if (unlikely(in_interrupt())) {
-		if (in_irq()) {
+		if (in_nmi()) {
+			if (type != KM_NMI && type != KM_NMI_PTE) {
+				WARN_ON(1);
+				warn_count--;
+			}
+		} else if (in_irq()) {
 			if (type != KM_IRQ0 && type != KM_IRQ1 &&
 			    type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
-			    type != KM_BOUNCE_READ) {
+			    type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
 				WARN_ON(1);
 				warn_count--;
 			}
@@ -452,7 +457,9 @@ void debug_kmap_atomic(enum km_type type)
 	}
 
 	if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
-	    type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
+	    type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
+	    type == KM_IRQ_PTE || type == KM_NMI ||
+	    type == KM_NMI_PTE) {
 		if (!irqs_disabled()) {
 			WARN_ON(1);
 			warn_count--;
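The check above enforces the old fixed-slot kmap_atomic discipline: each KM_* slot belongs to one execution context, and IRQ/NMI-class slots may only be used with interrupts disabled. A sketch of a conforming use under the 2.6.3x two-argument API (the helper itself is illustrative):

	#include <linux/highmem.h>
	#include <linux/string.h>

	/* illustrative: copy out of a highmem page from hard-irq context,
	 * where irqs are already disabled and KM_IRQ0 is the right slot */
	static void copy_from_page_in_irq(struct page *page, void *dst,
					  size_t len)
	{
		void *src = kmap_atomic(page, KM_IRQ0);

		memcpy(dst, src, len);
		kunmap_atomic(src, KM_IRQ0);
	}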
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 4ea4510e2996..8bf765c4f58d 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -833,12 +833,15 @@ static void early_alloc(struct early_log *log)
 	 */
 	rcu_read_lock();
 	object = create_object((unsigned long)log->ptr, log->size,
-			       log->min_count, GFP_KERNEL);
+			       log->min_count, GFP_ATOMIC);
+	if (!object)
+		goto out;
 	spin_lock_irqsave(&object->lock, flags);
 	for (i = 0; i < log->trace_len; i++)
 		object->trace[i] = log->trace[i];
 	object->trace_len = log->trace_len;
 	spin_unlock_irqrestore(&object->lock, flags);
+out:
 	rcu_read_unlock();
 }
 
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -184,11 +184,6 @@ static DEFINE_SPINLOCK(ksm_mmlist_lock);
 		sizeof(struct __struct), __alignof__(struct __struct),\
 		(__flags), NULL)
 
-static void __init ksm_init_max_kernel_pages(void)
-{
-	ksm_max_kernel_pages = nr_free_buffer_pages() / 4;
-}
-
 static int __init ksm_slab_init(void)
 {
 	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
@@ -1017,6 +1012,7 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page,
 		struct rmap_item *tree_rmap_item;
 		int ret;
 
+		cond_resched();
 		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
 		page2[0] = get_mergeable_page(tree_rmap_item);
 		if (!page2[0])
@@ -1673,7 +1669,7 @@ static int __init ksm_init(void)
 	struct task_struct *ksm_thread;
 	int err;
 
-	ksm_init_max_kernel_pages();
+	ksm_max_kernel_pages = totalram_pages / 4;
 
 	err = ksm_slab_init();
 	if (err)
@@ -1697,6 +1693,9 @@ static int __init ksm_init(void)
 		kthread_stop(ksm_thread);
 		goto out_free2;
 	}
+#else
+	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */
+
 #endif /* CONFIG_SYSFS */
 
 	return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e2b98a6875c0..f99f5991d6bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -313,7 +313,8 @@ soft_limit_tree_from_page(struct page *page)
 static void
 __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
 				struct mem_cgroup_per_zone *mz,
-				struct mem_cgroup_tree_per_zone *mctz)
+				struct mem_cgroup_tree_per_zone *mctz,
+				unsigned long long new_usage_in_excess)
 {
 	struct rb_node **p = &mctz->rb_root.rb_node;
 	struct rb_node *parent = NULL;
@@ -322,7 +323,9 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
 	if (mz->on_tree)
 		return;
 
-	mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+	mz->usage_in_excess = new_usage_in_excess;
+	if (!mz->usage_in_excess)
+		return;
 	while (*p) {
 		parent = *p;
 		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
@@ -353,16 +356,6 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
 }
 
 static void
-mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
-				struct mem_cgroup_per_zone *mz,
-				struct mem_cgroup_tree_per_zone *mctz)
-{
-	spin_lock(&mctz->lock);
-	__mem_cgroup_insert_exceeded(mem, mz, mctz);
-	spin_unlock(&mctz->lock);
-}
-
-static void
 mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz)
@@ -392,34 +385,36 @@ static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
 
 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
 {
-	unsigned long long prev_usage_in_excess, new_usage_in_excess;
-	bool updated_tree = false;
+	unsigned long long excess;
 	struct mem_cgroup_per_zone *mz;
 	struct mem_cgroup_tree_per_zone *mctz;
-
-	mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
 	mctz = soft_limit_tree_from_page(page);
 
 	/*
-	 * We do updates in lazy mode, mem's are removed
-	 * lazily from the per-zone, per-node rb tree
+	 * Necessary to update all ancestors when hierarchy is used,
+	 * because their event counter is not touched.
 	 */
-	prev_usage_in_excess = mz->usage_in_excess;
-
-	new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
-	if (prev_usage_in_excess) {
-		mem_cgroup_remove_exceeded(mem, mz, mctz);
-		updated_tree = true;
-	}
-	if (!new_usage_in_excess)
-		goto done;
-	mem_cgroup_insert_exceeded(mem, mz, mctz);
-
-done:
-	if (updated_tree) {
-		spin_lock(&mctz->lock);
-		mz->usage_in_excess = new_usage_in_excess;
-		spin_unlock(&mctz->lock);
+	for (; mem; mem = parent_mem_cgroup(mem)) {
+		mz = mem_cgroup_zoneinfo(mem, nid, zid);
+		excess = res_counter_soft_limit_excess(&mem->res);
+		/*
+		 * We have to update the tree if mz is on RB-tree or
+		 * mem is over its softlimit.
+		 */
+		if (excess || mz->on_tree) {
+			spin_lock(&mctz->lock);
+			/* if on-tree, remove it */
+			if (mz->on_tree)
+				__mem_cgroup_remove_exceeded(mem, mz, mctz);
+			/*
+			 * Insert again. mz->usage_in_excess will be updated.
+			 * If excess is 0, no tree ops.
+			 */
+			__mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
+			spin_unlock(&mctz->lock);
+		}
 	}
 }
 
@@ -447,9 +442,10 @@ static struct mem_cgroup_per_zone *
 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 {
 	struct rb_node *rightmost = NULL;
-	struct mem_cgroup_per_zone *mz = NULL;
+	struct mem_cgroup_per_zone *mz;
 
 retry:
+	mz = NULL;
 	rightmost = rb_last(&mctz->rb_root);
 	if (!rightmost)
 		goto done;		/* Nothing to reclaim from */
@@ -1270,9 +1266,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			gfp_t gfp_mask, struct mem_cgroup **memcg,
 			bool oom, struct page *page)
 {
-	struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
+	struct mem_cgroup *mem, *mem_over_limit;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct res_counter *fail_res, *soft_fail_res = NULL;
+	struct res_counter *fail_res;
 
 	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
 		/* Don't account this! */
@@ -1304,17 +1300,16 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 
 		if (mem_cgroup_is_root(mem))
 			goto done;
-		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
-						&soft_fail_res);
+		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
 		if (likely(!ret)) {
 			if (!do_swap_account)
 				break;
 			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
-							&fail_res, NULL);
+							&fail_res);
 			if (likely(!ret))
 				break;
 			/* mem+swap counter fails */
-			res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+			res_counter_uncharge(&mem->res, PAGE_SIZE);
 			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 									memsw);
@@ -1353,16 +1348,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		}
 	}
 	/*
-	 * Insert just the ancestor, we should trickle down to the correct
-	 * cgroup for reclaim, since the other nodes will be below their
-	 * soft limit
+	 * Insert ancestor (and ancestor's ancestors) into the softlimit
+	 * RB-tree if they exceed their soft limit.
 	 */
-	if (soft_fail_res) {
-		mem_over_soft_limit =
-			mem_cgroup_from_res_counter(soft_fail_res, res);
-		if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
-			mem_cgroup_update_tree(mem_over_soft_limit, page);
-	}
+	if (mem_cgroup_soft_limit_check(mem))
+		mem_cgroup_update_tree(mem, page);
 done:
 	return 0;
 nomem:
@@ -1437,10 +1427,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
 		if (!mem_cgroup_is_root(mem)) {
-			res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+			res_counter_uncharge(&mem->res, PAGE_SIZE);
 			if (do_swap_account)
-				res_counter_uncharge(&mem->memsw, PAGE_SIZE,
-							NULL);
+				res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 		}
 		css_put(&mem->css);
 		return;
@@ -1519,7 +1508,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 		goto out;
 
 	if (!mem_cgroup_is_root(from))
-		res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
+		res_counter_uncharge(&from->res, PAGE_SIZE);
 	mem_cgroup_charge_statistics(from, pc, false);
 
 	page = pc->page;
@@ -1539,7 +1528,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	}
 
 	if (do_swap_account && !mem_cgroup_is_root(from))
-		res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
+		res_counter_uncharge(&from->memsw, PAGE_SIZE);
 	css_put(&from->css);
 
 	css_get(&to->css);
@@ -1610,9 +1599,9 @@ uncharge:
 	css_put(&parent->css);
 	/* uncharge if move fails */
 	if (!mem_cgroup_is_root(parent)) {
-		res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
+		res_counter_uncharge(&parent->res, PAGE_SIZE);
 		if (do_swap_account)
-			res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
+			res_counter_uncharge(&parent->memsw, PAGE_SIZE);
 	}
 	return ret;
 }
@@ -1803,8 +1792,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 		 * calling css_tryget
 		 */
 		if (!mem_cgroup_is_root(memcg))
-			res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
-						NULL);
+			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
 		mem_cgroup_swap_statistics(memcg, false);
 		mem_cgroup_put(memcg);
 	}
@@ -1831,9 +1819,9 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 	if (!mem)
 		return;
 	if (!mem_cgroup_is_root(mem)) {
-		res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		if (do_swap_account)
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 	}
 	css_put(&mem->css);
 }
@@ -1848,7 +1836,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
 	struct mem_cgroup_per_zone *mz;
-	bool soft_limit_excess = false;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -1888,10 +1875,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	}
 
 	if (!mem_cgroup_is_root(mem)) {
-		res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		if (do_swap_account &&
 				(ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 	}
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		mem_cgroup_swap_statistics(mem, true);
@@ -1908,7 +1895,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	mz = page_cgroup_zoneinfo(pc);
 	unlock_page_cgroup(pc);
 
-	if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
+	if (mem_cgroup_soft_limit_check(mem))
 		mem_cgroup_update_tree(mem, page);
 	/* at swapout, this memcg will be accessed to record to swap */
 	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
@@ -1986,7 +1973,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
 	 * This memcg can be obsolete one. We avoid calling css_tryget
 	 */
 	if (!mem_cgroup_is_root(memcg))
-		res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
+		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
 	mem_cgroup_swap_statistics(memcg, false);
 	mem_cgroup_put(memcg);
 }
@@ -2233,6 +2220,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 	unsigned long reclaimed;
 	int loop = 0;
 	struct mem_cgroup_tree_per_zone *mctz;
+	unsigned long long excess;
 
 	if (order > 0)
 		return 0;
@@ -2284,9 +2272,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 				break;
 			} while (1);
 		}
-		mz->usage_in_excess =
-			res_counter_soft_limit_excess(&mz->mem->res);
 		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+		excess = res_counter_soft_limit_excess(&mz->mem->res);
 		/*
 		 * One school of thought says that we should not add
 		 * back the node to the tree if reclaim returns 0.
@@ -2295,8 +2282,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 		 * memory to reclaim from. Consider this as a longer
 		 * term TODO.
 		 */
-		if (mz->usage_in_excess)
-			__mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
+		/* If excess == 0, no tree ops */
+		__mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
 		spin_unlock(&mctz->lock);
 		css_put(&mz->mem->css);
 		loop++;
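The tree manipulation above follows the standard rbtree rule that a node's sort key cannot be changed in place: the per-zone entry is erased, its usage_in_excess updated, and only reinserted when the new excess is non-zero. A generic sketch of that idiom with the kernel rbtree API (the zone_excess structure and helper are hypothetical):

	#include <linux/rbtree.h>
	#include <linux/types.h>

	struct zone_excess {
		struct rb_node node;
		unsigned long long excess;	/* sort key */
		bool on_tree;
	};

	/* illustrative: re-key a node by erase + reinsert; a zero key
	 * means "stay off the tree" rather than inserting a dead node */
	static void update_excess(struct rb_root *root, struct zone_excess *ze,
				  unsigned long long new_excess)
	{
		struct rb_node **p = &root->rb_node, *parent = NULL;

		if (ze->on_tree) {
			rb_erase(&ze->node, root);
			ze->on_tree = false;
		}
		ze->excess = new_excess;
		if (!new_excess)
			return;
		while (*p) {
			struct zone_excess *e =
				rb_entry(*p, struct zone_excess, node);

			parent = *p;
			/* keep the largest excess rightmost */
			p = e->excess < new_excess ?
				&(*p)->rb_right : &(*p)->rb_left;
		}
		rb_link_node(&ze->node, parent, p);
		rb_insert_color(&ze->node, root);
		ze->on_tree = true;
	}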
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 729d4b15b645..dacc64183874 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -35,6 +35,7 @@
 #include <linux/mm.h>
 #include <linux/page-flags.h>
 #include <linux/sched.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
@@ -370,9 +371,6 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
 	int ret = FAILED;
 	struct address_space *mapping;
 
-	if (!isolate_lru_page(p))
-		page_cache_release(p);
-
 	/*
 	 * For anonymous pages we're done the only reference left
 	 * should be the one m_f() holds.
@@ -498,30 +496,18 @@
  */
 static int me_swapcache_dirty(struct page *p, unsigned long pfn)
 {
-	int ret = FAILED;
-
 	ClearPageDirty(p);
 	/* Trigger EIO in shmem: */
 	ClearPageUptodate(p);
 
-	if (!isolate_lru_page(p)) {
-		page_cache_release(p);
-		ret = DELAYED;
-	}
-
-	return ret;
+	return DELAYED;
 }
 
 static int me_swapcache_clean(struct page *p, unsigned long pfn)
 {
-	int ret = FAILED;
-
-	if (!isolate_lru_page(p)) {
-		page_cache_release(p);
-		ret = RECOVERED;
-	}
 	delete_from_swap_cache(p);
-	return ret;
+
+	return RECOVERED;
 }
 
 /*
@@ -611,8 +597,6 @@ static struct page_state {
 	{ 0,		0,		"unknown page state",	me_unknown },
 };
 
-#undef lru
-
 static void action_result(unsigned long pfn, char *msg, int result)
 {
 	struct page *page = NULL;
@@ -629,13 +613,16 @@ static int page_action(struct page_state *ps, struct page *p,
 			unsigned long pfn, int ref)
 {
 	int result;
+	int count;
 
 	result = ps->action(p, pfn);
 	action_result(pfn, ps->msg, result);
-	if (page_count(p) != 1 + ref)
+
+	count = page_count(p) - 1 - ref;
+	if (count != 0)
 		printk(KERN_ERR
 		       "MCE %#lx: %s page still referenced by %d users\n",
-		       pfn, ps->msg, page_count(p) - 1);
+		       pfn, ps->msg, count);
 
 	/* Could do more checks here if page looks ok */
 	/*
@@ -661,12 +648,9 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	int i;
 	int kill = 1;
 
-	if (PageReserved(p) || PageCompound(p) || PageSlab(p))
+	if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
 		return;
 
-	if (!PageLRU(p))
-		lru_add_drain_all();
-
 	/*
 	 * This check implies we don't kill processes if their pages
 	 * are in the swap cache early. Those are always late kills.
@@ -738,6 +722,7 @@
 
 int __memory_failure(unsigned long pfn, int trapno, int ref)
 {
+	unsigned long lru_flag;
 	struct page_state *ps;
 	struct page *p;
 	int res;
@@ -775,6 +760,24 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
 	}
 
 	/*
+	 * We ignore non-LRU pages for good reasons.
+	 * - PG_locked is only well defined for LRU pages and a few others
+	 * - to avoid races with __set_page_locked()
+	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
+	 * The check (unnecessarily) ignores LRU pages being isolated and
+	 * walked by the page reclaim code, however that's not a big loss.
+	 */
+	if (!PageLRU(p))
+		lru_add_drain_all();
+	lru_flag = p->flags & lru;
+	if (isolate_lru_page(p)) {
+		action_result(pfn, "non LRU", IGNORED);
+		put_page(p);
+		return -EBUSY;
+	}
+	page_cache_release(p);
+
+	/*
 	 * Lock the page and wait for writeback to finish.
 	 * It's very difficult to mess with pages currently under IO
 	 * and in many cases impossible, so we just avoid it here.
@@ -790,7 +793,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
 	/*
 	 * Torn down by someone else?
 	 */
-	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+	if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) {
 		action_result(pfn, "already truncated LRU", IGNORED);
 		res = 0;
 		goto out;
@@ -798,7 +801,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
 
 	res = -EBUSY;
 	for (ps = error_states;; ps++) {
-		if ((p->flags & ps->mask) == ps->res) {
+		if (((p->flags | lru_flag) & ps->mask) == ps->res) {
 			res = page_action(ps, p, pfn, ref);
 			break;
 		}
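Taken together, these hunks move LRU isolation to one early point in __memory_failure() and remember the PG_lru bit before isolate_lru_page() clears it, so the error_states table can still match "lru" states afterwards ("lru" is the file's shorthand macro for the PG_lru mask). A sketch of that order of operations, distilled into a hypothetical helper (isolate_lru_page() returns 0 on success and takes its own page reference):

	#include <linux/mm.h>
	#include <linux/swap.h>
	#include <linux/pagemap.h>
	#include <linux/page-flags.h>

	/* illustrative: capture PG_lru, then take the page off its list */
	static int take_page_off_lru(struct page *p, unsigned long *lru_flag)
	{
		if (!PageLRU(p))
			lru_add_drain_all();	/* flush per-cpu pagevecs */
		*lru_flag = p->flags & (1UL << PG_lru);
		if (isolate_lru_page(p))
			return -EBUSY;		/* never was an LRU page */
		page_cache_release(p);		/* drop isolate's extra ref */
		return 0;
	}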
diff --git a/mm/memory.c b/mm/memory.c
index 7e91b5f9f690..6ab19dd4a199 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -641,6 +641,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
 		unsigned long addr, unsigned long end)
 {
+	pte_t *orig_src_pte, *orig_dst_pte;
 	pte_t *src_pte, *dst_pte;
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
@@ -654,6 +655,8 @@ again:
 	src_pte = pte_offset_map_nested(src_pmd, addr);
 	src_ptl = pte_lockptr(src_mm, src_pmd);
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+	orig_src_pte = src_pte;
+	orig_dst_pte = dst_pte;
 	arch_enter_lazy_mmu_mode();
 
 	do {
@@ -677,9 +680,9 @@ again:
 
 	arch_leave_lazy_mmu_mode();
 	spin_unlock(src_ptl);
-	pte_unmap_nested(src_pte - 1);
+	pte_unmap_nested(orig_src_pte);
 	add_mm_rss(dst_mm, rss[0], rss[1]);
-	pte_unmap_unlock(dst_pte - 1, dst_ptl);
+	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
 	if (addr != end)
 		goto again;
@@ -1820,10 +1823,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	token = pmd_pgtable(*pmd);
 
 	do {
-		err = fn(pte, token, addr, data);
+		err = fn(pte++, token, addr, data);
 		if (err)
 			break;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
+	} while (addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
 
@@ -2539,7 +2542,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	} else if (PageHWPoison(page)) {
 		ret = VM_FAULT_HWPOISON;
 		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
-		goto out;
+		goto out_release;
 	}
 
 	lock_page(page);
@@ -2611,6 +2614,7 @@ out_nomap:
 	pte_unmap_unlock(page_table, ptl);
 out_page:
 	unlock_page(page);
+out_release:
 	page_cache_release(page);
 	return ret;
 }
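The copy_pte_range() part of this diff fixes a subtle unmap bug: recovering the mapped PTE base as "src_pte - 1" after the loop only works if the cursor was post-incremented exactly once per iteration, which the loop no longer guarantees. Saving the pointer right after mapping is the robust form. A minimal sketch of the idiom (the walker function is hypothetical):

	#include <linux/mm.h>

	/* illustrative: unmap the saved base pointer, not "cursor - 1" */
	static void walk_ptes(struct mm_struct *mm, pmd_t *pmd,
			      unsigned long addr, unsigned long end)
	{
		spinlock_t *ptl;
		pte_t *pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
		pte_t *orig_pte = pte;	/* remember where the mapping starts */

		do {
			/* ... examine *pte ... */
		} while (pte++, addr += PAGE_SIZE, addr != end);

		pte_unmap_unlock(orig_pte, ptl);
	}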
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 821dee596377..2047465cd27c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,6 +26,7 @@
 #include <linux/migrate.h>
 #include <linux/page-isolation.h>
 #include <linux/pfn.h>
+#include <linux/suspend.h>
 
 #include <asm/tlbflush.h>
 
@@ -447,7 +448,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 }
 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
 
-static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
+/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
+static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 {
 	struct pglist_data *pgdat;
 	unsigned long zones_size[MAX_NR_ZONES] = {0};
@@ -484,14 +486,18 @@ int __ref add_memory(int nid, u64 start, u64 size)
 	struct resource *res;
 	int ret;
 
+	lock_system_sleep();
+
 	res = register_memory_resource(start, size);
+	ret = -EEXIST;
 	if (!res)
-		return -EEXIST;
+		goto out;
 
 	if (!node_online(nid)) {
 		pgdat = hotadd_new_pgdat(nid, start);
+		ret = -ENOMEM;
 		if (!pgdat)
-			return -ENOMEM;
+			goto out;
 		new_pgdat = 1;
 	}
 
@@ -514,7 +520,8 @@ int __ref add_memory(int nid, u64 start, u64 size)
 		BUG_ON(ret);
 	}
 
-	return ret;
+	goto out;
+
 error:
 	/* rollback pgdat allocation and others */
 	if (new_pgdat)
@@ -522,6 +529,8 @@ error:
 	if (res)
 		release_memory_resource(res);
 
+out:
+	unlock_system_sleep();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(add_memory);
@@ -758,6 +767,8 @@ int offline_pages(unsigned long start_pfn,
 	if (!test_pages_in_a_zone(start_pfn, end_pfn))
 		return -EINVAL;
 
+	lock_system_sleep();
+
 	zone = page_zone(pfn_to_page(start_pfn));
 	node = zone_to_nid(zone);
 	nr_pages = end_pfn - start_pfn;
@@ -765,7 +776,7 @@ int offline_pages(unsigned long start_pfn,
 	/* set above range as isolated */
 	ret = start_isolate_page_range(start_pfn, end_pfn);
 	if (ret)
-		return ret;
+		goto out;
 
 	arg.start_pfn = start_pfn;
 	arg.nr_pages = nr_pages;
@@ -843,6 +854,7 @@ repeat:
 	writeback_set_ratelimit();
 
 	memory_notify(MEM_OFFLINE, &arg);
+	unlock_system_sleep();
 	return 0;
 
 failed_removal:
@@ -852,6 +864,8 @@ failed_removal:
 	/* pushback to free area */
 	undo_isolate_page_range(start_pfn, end_pfn);
 
+out:
+	unlock_system_sleep();
 	return ret;
 }
 
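The hotplug changes bracket the whole add/offline operation with lock_system_sleep()/unlock_system_sleep(), which is why every early "return err;" inside the bracket becomes "goto out;". A compact sketch of the resulting shape (the function and its body are hypothetical; only the locking discipline mirrors the hunks above):

	#include <linux/suspend.h>
	#include <linux/errno.h>

	/* illustrative: single-exit form so the lock is never leaked */
	static int guarded_hotplug_op(int have_resource)
	{
		int ret = -EEXIST;

		lock_system_sleep();	/* keep hibernation out meanwhile */
		if (!have_resource)
			goto out;
		/* ... perform the actual hot-add/offline steps ... */
		ret = 0;
	out:
		unlock_system_sleep();	/* every path releases the lock */
		return ret;
	}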
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7dd9d9f80694..4545d5944243 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1024,7 +1024,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 
 		err = migrate_prep();
 		if (err)
-			return err;
+			goto mpol_out;
 	}
 	{
 		NODEMASK_SCRATCH(scratch);
@@ -1039,10 +1039,9 @@ static long do_mbind(unsigned long start, unsigned long len,
 			err = -ENOMEM;
 		NODEMASK_SCRATCH_FREE(scratch);
 	}
-	if (err) {
-		mpol_put(new);
-		return err;
-	}
+	if (err)
+		goto mpol_out;
+
 	vma = check_range(mm, start, end, nmask,
 			  flags | MPOL_MF_INVERT, &pagelist);
 
@@ -1058,9 +1057,11 @@ static long do_mbind(unsigned long start, unsigned long len,
 
 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
-	}
+	} else
+		putback_lru_pages(&pagelist);
 
 	up_write(&mm->mmap_sem);
+mpol_out:
 	mpol_put(new);
 	return err;
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 1a4bf4813780..7dbcb22316d2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -602,7 +602,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	struct page *newpage = get_new_page(page, private, &result);
 	int rcu_locked = 0;
 	int charge = 0;
-	struct mem_cgroup *mem;
+	struct mem_cgroup *mem = NULL;
 
 	if (!newpage)
 		return -ENOMEM;
diff --git a/mm/nommu.c b/mm/nommu.c
index 5189b5aed8c0..9876fa0c3ad3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1362,9 +1362,11 @@ share:
 error_just_free:
 	up_write(&nommu_region_sem);
 error:
-	fput(region->vm_file);
+	if (region->vm_file)
+		fput(region->vm_file);
 	kmem_cache_free(vm_region_jar, region);
-	fput(vma->vm_file);
+	if (vma->vm_file)
+		fput(vma->vm_file);
 	if (vma->vm_flags & VM_EXECUTABLE)
 		removed_exe_file_vma(vma->vm_mm);
 	kmem_cache_free(vm_area_cachep, vma);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a3b14090b1fb..2c5d79236ead 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -566,7 +566,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (pages_written >= write_chunk)
 			break;		/* We've done our duty */
 
-		schedule_timeout_interruptible(pause);
+		__set_current_state(TASK_INTERRUPTIBLE);
+		io_schedule_timeout(pause);
 
 		/*
 		 * Increase the delay for each loop, up to our previous
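The one-line-to-two-line change above is about accounting, not behavior: io_schedule_timeout() marks the sleeping task as waiting on IO, so throttled writers show up in iowait and in block-layer delay statistics instead of as ordinary sleeps. A sketch of the idiom as a tiny helper (the helper name is illustrative):

	#include <linux/sched.h>

	/* illustrative: sleep for @timeout jiffies, charged to iowait */
	static long sleep_charged_to_iowait(long timeout)
	{
		__set_current_state(TASK_INTERRUPTIBLE);
		return io_schedule_timeout(timeout);
	}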
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bf720550b44d..2bc2ac63f41e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1769,7 +1769,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 		 */
 		alloc_flags &= ~ALLOC_CPUSET;
-	} else if (unlikely(rt_task(p)))
+	} else if (unlikely(rt_task(p)) && !in_interrupt())
 		alloc_flags |= ALLOC_HARDER;
 
 	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
@@ -1817,9 +1817,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
 
+restart:
 	wake_all_kswapd(order, zonelist, high_zoneidx);
 
-restart:
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
 	 * reclaim. Now things get more complex, so set up alloc_flags according
@@ -2183,7 +2183,7 @@ void show_free_areas(void)
 	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
 		" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
 		" unevictable:%lu"
-		" dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n"
+		" dirty:%lu writeback:%lu unstable:%lu\n"
 		" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
 		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
 		global_page_state(NR_ACTIVE_ANON),
@@ -2196,7 +2196,6 @@ void show_free_areas(void)
 		global_page_state(NR_FILE_DIRTY),
 		global_page_state(NR_WRITEBACK),
 		global_page_state(NR_UNSTABLE_NFS),
-		nr_blockdev_pages(),
 		global_page_state(NR_FREE_PAGES),
 		global_page_state(NR_SLAB_RECLAIMABLE),
 		global_page_state(NR_SLAB_UNRECLAIMABLE),
diff --git a/mm/percpu.c b/mm/percpu.c
index 43d8cacfdaa5..5adfc268b408 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -153,7 +153,10 @@ static int pcpu_reserved_chunk_limit;
  *
  * During allocation, pcpu_alloc_mutex is kept locked all the time and
  * pcpu_lock is grabbed and released as necessary.  All actual memory
- * allocations are done using GFP_KERNEL with pcpu_lock released.
+ * allocations are done using GFP_KERNEL with pcpu_lock released.  In
+ * general, percpu memory can't be allocated with irq off but
+ * irqsave/restore are still used in alloc path so that it can be used
+ * from early init path - sched_init() specifically.
  *
 * Free path accesses and alters only the index data structures, so it
 * can be safely called from atomic context.  When memory needs to be
@@ -352,62 +355,86 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 }
 
 /**
- * pcpu_extend_area_map - extend area map for allocation
- * @chunk: target chunk
+ * pcpu_need_to_extend - determine whether chunk area map needs to be extended
+ * @chunk: chunk of interest
  *
- * Extend area map of @chunk so that it can accommodate an allocation.
- * A single allocation can split an area into three areas, so this
- * function makes sure that @chunk->map has at least two extra slots.
+ * Determine whether area map of @chunk needs to be extended to
+ * accommodate a new allocation.
  *
  * CONTEXT:
- * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
- * if area map is extended.
+ * pcpu_lock.
  *
  * RETURNS:
- * 0 if noop, 1 if successfully extended, -errno on failure.
+ * New target map allocation length if extension is necessary, 0
+ * otherwise.
  */
-static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
+static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
 {
 	int new_alloc;
-	int *new;
-	size_t size;
 
-	/* has enough? */
 	if (chunk->map_alloc >= chunk->map_used + 2)
 		return 0;
 
-	spin_unlock_irq(&pcpu_lock);
-
 	new_alloc = PCPU_DFL_MAP_ALLOC;
 	while (new_alloc < chunk->map_used + 2)
 		new_alloc *= 2;
 
-	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-	if (!new) {
-		spin_lock_irq(&pcpu_lock);
+	return new_alloc;
+}
+
+/**
+ * pcpu_extend_area_map - extend area map of a chunk
+ * @chunk: chunk of interest
+ * @new_alloc: new target allocation length of the area map
+ *
+ * Extend area map of @chunk to have @new_alloc entries.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.  Grabs and releases pcpu_lock.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+{
+	int *old = NULL, *new = NULL;
+	size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
+	unsigned long flags;
+
+	new = pcpu_mem_alloc(new_size);
+	if (!new)
 		return -ENOMEM;
-	}
 
-	/*
-	 * Acquire pcpu_lock and switch to new area map.  Only free
-	 * could have happened in between, so map_used couldn't have
-	 * grown.
-	 */
-	spin_lock_irq(&pcpu_lock);
-	BUG_ON(new_alloc < chunk->map_used + 2);
+	/* acquire pcpu_lock and switch to new area map */
+	spin_lock_irqsave(&pcpu_lock, flags);
+
+	if (new_alloc <= chunk->map_alloc)
+		goto out_unlock;
 
-	size = chunk->map_alloc * sizeof(chunk->map[0]);
-	memcpy(new, chunk->map, size);
+	old_size = chunk->map_alloc * sizeof(chunk->map[0]);
+	memcpy(new, chunk->map, old_size);
 
 	/*
 	 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
 	 * one of the first chunks and still using static map.
 	 */
 	if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
-		pcpu_mem_free(chunk->map, size);
+		old = chunk->map;
 
 	chunk->map_alloc = new_alloc;
 	chunk->map = new;
+	new = NULL;
+
+out_unlock:
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+
+	/*
+	 * pcpu_mem_free() might end up calling vfree() which uses
+	 * IRQ-unsafe lock and thus can't be called under pcpu_lock.
+	 */
+	pcpu_mem_free(old, old_size);
+	pcpu_mem_free(new, new_size);
+
 	return 0;
 }
 
@@ -1043,8 +1070,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) | |||
1043 | */ | 1070 | */ |
1044 | static void *pcpu_alloc(size_t size, size_t align, bool reserved) | 1071 | static void *pcpu_alloc(size_t size, size_t align, bool reserved) |
1045 | { | 1072 | { |
1073 | static int warn_limit = 10; | ||
1046 | struct pcpu_chunk *chunk; | 1074 | struct pcpu_chunk *chunk; |
1047 | int slot, off; | 1075 | const char *err; |
1076 | int slot, off, new_alloc; | ||
1077 | unsigned long flags; | ||
1048 | 1078 | ||
1049 | if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { | 1079 | if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { |
1050 | WARN(true, "illegal size (%zu) or align (%zu) for " | 1080 | WARN(true, "illegal size (%zu) or align (%zu) for " |
@@ -1053,17 +1083,31 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) | |||
1053 | } | 1083 | } |
1054 | 1084 | ||
1055 | mutex_lock(&pcpu_alloc_mutex); | 1085 | mutex_lock(&pcpu_alloc_mutex); |
1056 | spin_lock_irq(&pcpu_lock); | 1086 | spin_lock_irqsave(&pcpu_lock, flags); |
1057 | 1087 | ||
1058 | /* serve reserved allocations from the reserved chunk if available */ | 1088 | /* serve reserved allocations from the reserved chunk if available */ |
1059 | if (reserved && pcpu_reserved_chunk) { | 1089 | if (reserved && pcpu_reserved_chunk) { |
1060 | chunk = pcpu_reserved_chunk; | 1090 | chunk = pcpu_reserved_chunk; |
1061 | if (size > chunk->contig_hint || | 1091 | |
1062 | pcpu_extend_area_map(chunk) < 0) | 1092 | if (size > chunk->contig_hint) { |
1093 | err = "alloc from reserved chunk failed"; | ||
1063 | goto fail_unlock; | 1094 | goto fail_unlock; |
1095 | } | ||
1096 | |||
1097 | while ((new_alloc = pcpu_need_to_extend(chunk))) { | ||
1098 | spin_unlock_irqrestore(&pcpu_lock, flags); | ||
1099 | if (pcpu_extend_area_map(chunk, new_alloc) < 0) { | ||
1100 | err = "failed to extend area map of reserved chunk"; | ||
1101 | goto fail_unlock_mutex; | ||
1102 | } | ||
1103 | spin_lock_irqsave(&pcpu_lock, flags); | ||
1104 | } | ||
1105 | |||
1064 | off = pcpu_alloc_area(chunk, size, align); | 1106 | off = pcpu_alloc_area(chunk, size, align); |
1065 | if (off >= 0) | 1107 | if (off >= 0) |
1066 | goto area_found; | 1108 | goto area_found; |
1109 | |||
1110 | err = "alloc from reserved chunk failed"; | ||
1067 | goto fail_unlock; | 1111 | goto fail_unlock; |
1068 | } | 1112 | } |
1069 | 1113 | ||
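Both extension call sites (the reserved-chunk path above and the slot walk in the next hunk) follow the same idiom once pcpu_need_to_extend() reports the map is too small: drop pcpu_lock, perform the sleeping allocation, retake the lock, and trust nothing computed under the previous critical section. Condensed from the hunks (the reserved-chunk path loops where the slot walk jumps to restart):

	spin_unlock_irqrestore(&pcpu_lock, flags);

	if (pcpu_extend_area_map(chunk, new_alloc) < 0)	/* may sleep */
		goto fail_unlock_mutex;

	spin_lock_irqsave(&pcpu_lock, flags);
	goto restart;	/* chunk lists may have changed meanwhile */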
@@ -1074,13 +1118,20 @@ restart: | |||
1074 | if (size > chunk->contig_hint) | 1118 | if (size > chunk->contig_hint) |
1075 | continue; | 1119 | continue; |
1076 | 1120 | ||
1077 | switch (pcpu_extend_area_map(chunk)) { | 1121 | new_alloc = pcpu_need_to_extend(chunk); |
1078 | case 0: | 1122 | if (new_alloc) { |
1079 | break; | 1123 | spin_unlock_irqrestore(&pcpu_lock, flags); |
1080 | case 1: | 1124 | if (pcpu_extend_area_map(chunk, |
1081 | goto restart; /* pcpu_lock dropped, restart */ | 1125 | new_alloc) < 0) { |
1082 | default: | 1126 | err = "failed to extend area map"; |
1083 | goto fail_unlock; | 1127 | goto fail_unlock_mutex; |
1128 | } | ||
1129 | spin_lock_irqsave(&pcpu_lock, flags); | ||
1130 | /* | ||
1131 | * pcpu_lock has been dropped, need to | ||
1132 | * restart cpu_slot list walking. | ||
1133 | */ | ||
1134 | goto restart; | ||
1084 | } | 1135 | } |
1085 | 1136 | ||
1086 | off = pcpu_alloc_area(chunk, size, align); | 1137 | off = pcpu_alloc_area(chunk, size, align); |
@@ -1090,23 +1141,26 @@ restart: | |||
1090 | } | 1141 | } |
1091 | 1142 | ||
1092 | /* hmmm... no space left, create a new chunk */ | 1143 | /* hmmm... no space left, create a new chunk */ |
1093 | spin_unlock_irq(&pcpu_lock); | 1144 | spin_unlock_irqrestore(&pcpu_lock, flags); |
1094 | 1145 | ||
1095 | chunk = alloc_pcpu_chunk(); | 1146 | chunk = alloc_pcpu_chunk(); |
1096 | if (!chunk) | 1147 | if (!chunk) { |
1148 | err = "failed to allocate new chunk"; | ||
1097 | goto fail_unlock_mutex; | 1149 | goto fail_unlock_mutex; |
1150 | } | ||
1098 | 1151 | ||
1099 | spin_lock_irq(&pcpu_lock); | 1152 | spin_lock_irqsave(&pcpu_lock, flags); |
1100 | pcpu_chunk_relocate(chunk, -1); | 1153 | pcpu_chunk_relocate(chunk, -1); |
1101 | goto restart; | 1154 | goto restart; |
1102 | 1155 | ||
1103 | area_found: | 1156 | area_found: |
1104 | spin_unlock_irq(&pcpu_lock); | 1157 | spin_unlock_irqrestore(&pcpu_lock, flags); |
1105 | 1158 | ||
1106 | /* populate, map and clear the area */ | 1159 | /* populate, map and clear the area */ |
1107 | if (pcpu_populate_chunk(chunk, off, size)) { | 1160 | if (pcpu_populate_chunk(chunk, off, size)) { |
1108 | spin_lock_irq(&pcpu_lock); | 1161 | spin_lock_irqsave(&pcpu_lock, flags); |
1109 | pcpu_free_area(chunk, off); | 1162 | pcpu_free_area(chunk, off); |
1163 | err = "failed to populate"; | ||
1110 | goto fail_unlock; | 1164 | goto fail_unlock; |
1111 | } | 1165 | } |
1112 | 1166 | ||
@@ -1116,9 +1170,16 @@ area_found: | |||
1116 | return __addr_to_pcpu_ptr(chunk->base_addr + off); | 1170 | return __addr_to_pcpu_ptr(chunk->base_addr + off); |
1117 | 1171 | ||
1118 | fail_unlock: | 1172 | fail_unlock: |
1119 | spin_unlock_irq(&pcpu_lock); | 1173 | spin_unlock_irqrestore(&pcpu_lock, flags); |
1120 | fail_unlock_mutex: | 1174 | fail_unlock_mutex: |
1121 | mutex_unlock(&pcpu_alloc_mutex); | 1175 | mutex_unlock(&pcpu_alloc_mutex); |
1176 | if (warn_limit) { | ||
1177 | pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " | ||
1178 | "%s\n", size, align, err); | ||
1179 | dump_stack(); | ||
1180 | if (!--warn_limit) | ||
1181 | pr_info("PERCPU: limit reached, disabling warnings\n"); | ||
1182 | } | ||
1122 | return NULL; | 1183 | return NULL; |
1123 | } | 1184 | } |
1124 | 1185 | ||
@@ -1347,6 +1408,10 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info( | |||
1347 | struct pcpu_alloc_info *ai; | 1408 | struct pcpu_alloc_info *ai; |
1348 | unsigned int *cpu_map; | 1409 | unsigned int *cpu_map; |
1349 | 1410 | ||
1411 | /* this function may be called multiple times */ | ||
1412 | memset(group_map, 0, sizeof(group_map)); | ||
1413 | memset(group_cnt, 0, sizeof(group_cnt)); | ||
1414 | |||
1350 | /* | 1415 | /* |
1351 | * Determine min_unit_size, alloc_size and max_upa such that | 1416 | * Determine min_unit_size, alloc_size and max_upa such that |
1352 | * alloc_size is multiple of atom_size and is the smallest | 1417 | * alloc_size is multiple of atom_size and is the smallest |
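The two memsets are needed because pcpu_build_alloc_info() can run more than once (an embed-first-chunk attempt may fail and the page-based fallback calls it again), and its static scratch arrays keep their contents between calls. A toy illustration of why static storage must be re-zeroed on every entry:

	#include <stdio.h>
	#include <string.h>

	static int count_into_static(void)
	{
		static int cnt[4];		/* persists across calls */

		memset(cnt, 0, sizeof(cnt));	/* drop this and later calls see stale data */
		cnt[0]++;
		return cnt[0];
	}

	int main(void)
	{
		/* prints "1 1"; without the memset one call would return 2 */
		printf("%d %d\n", count_into_static(), count_into_static());
		return 0;
	}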
@@ -1574,6 +1639,7 @@ static void pcpu_dump_alloc_info(const char *lvl, | |||
1574 | int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | 1639 | int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, |
1575 | void *base_addr) | 1640 | void *base_addr) |
1576 | { | 1641 | { |
1642 | static char cpus_buf[4096] __initdata; | ||
1577 | static int smap[2], dmap[2]; | 1643 | static int smap[2], dmap[2]; |
1578 | size_t dyn_size = ai->dyn_size; | 1644 | size_t dyn_size = ai->dyn_size; |
1579 | size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; | 1645 | size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; |
@@ -1585,17 +1651,26 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1585 | int *unit_map; | 1651 | int *unit_map; |
1586 | int group, unit, i; | 1652 | int group, unit, i; |
1587 | 1653 | ||
1654 | cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask); | ||
1655 | |||
1656 | #define PCPU_SETUP_BUG_ON(cond) do { \ | ||
1657 | if (unlikely(cond)) { \ | ||
1658 | pr_emerg("PERCPU: failed to initialize, %s\n", #cond); \ | ||
1659 | pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \ | ||
1660 | pcpu_dump_alloc_info(KERN_EMERG, ai); \ | ||
1661 | BUG(); \ | ||
1662 | } \ | ||
1663 | } while (0) | ||
1664 | |||
1588 | /* sanity checks */ | 1665 | /* sanity checks */ |
1589 | BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || | 1666 | BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || |
1590 | ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); | 1667 | ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); |
1591 | BUG_ON(ai->nr_groups <= 0); | 1668 | PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); |
1592 | BUG_ON(!ai->static_size); | 1669 | PCPU_SETUP_BUG_ON(!ai->static_size); |
1593 | BUG_ON(!base_addr); | 1670 | PCPU_SETUP_BUG_ON(!base_addr); |
1594 | BUG_ON(ai->unit_size < size_sum); | 1671 | PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); |
1595 | BUG_ON(ai->unit_size & ~PAGE_MASK); | 1672 | PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); |
1596 | BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); | 1673 | PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); |
1597 | |||
1598 | pcpu_dump_alloc_info(KERN_DEBUG, ai); | ||
1599 | 1674 | ||
1600 | /* process group information and build config tables accordingly */ | 1675 | /* process group information and build config tables accordingly */ |
1601 | group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); | 1676 | group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); |
@@ -1604,7 +1679,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1604 | unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); | 1679 | unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); |
1605 | 1680 | ||
1606 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) | 1681 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) |
1607 | unit_map[cpu] = NR_CPUS; | 1682 | unit_map[cpu] = UINT_MAX; |
1608 | pcpu_first_unit_cpu = NR_CPUS; | 1683 | pcpu_first_unit_cpu = NR_CPUS; |
1609 | 1684 | ||
1610 | for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { | 1685 | for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { |
@@ -1618,8 +1693,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1618 | if (cpu == NR_CPUS) | 1693 | if (cpu == NR_CPUS) |
1619 | continue; | 1694 | continue; |
1620 | 1695 | ||
1621 | BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu)); | 1696 | PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids); |
1622 | BUG_ON(unit_map[cpu] != NR_CPUS); | 1697 | PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); |
1698 | PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); | ||
1623 | 1699 | ||
1624 | unit_map[cpu] = unit + i; | 1700 | unit_map[cpu] = unit + i; |
1625 | unit_off[cpu] = gi->base_offset + i * ai->unit_size; | 1701 | unit_off[cpu] = gi->base_offset + i * ai->unit_size; |
@@ -1632,7 +1708,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1632 | pcpu_nr_units = unit; | 1708 | pcpu_nr_units = unit; |
1633 | 1709 | ||
1634 | for_each_possible_cpu(cpu) | 1710 | for_each_possible_cpu(cpu) |
1635 | BUG_ON(unit_map[cpu] == NR_CPUS); | 1711 | PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); |
1712 | |||
1713 | /* we're done parsing the input, undefine BUG macro and dump config */ | ||
1714 | #undef PCPU_SETUP_BUG_ON | ||
1715 | pcpu_dump_alloc_info(KERN_INFO, ai); | ||
1636 | 1716 | ||
1637 | pcpu_nr_groups = ai->nr_groups; | 1717 | pcpu_nr_groups = ai->nr_groups; |
1638 | pcpu_group_offsets = group_offsets; | 1718 | pcpu_group_offsets = group_offsets; |
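PCPU_SETUP_BUG_ON() turns the bare BUG_ON()s into assertions that name the failed check (via # stringification) and dump the cpu mask and alloc_info before dying, turning an early-boot oops address into a decodable report. The shape of the macro, reduced to its essentials (dump_state() is a hypothetical placeholder for the pr_emerg() lines and pcpu_dump_alloc_info()):

	#define SETUP_BUG_ON(cond) do {					\
		if (unlikely(cond)) {					\
			pr_emerg("init failed, %s\n", #cond);		\
			dump_state();	/* print helpful context */	\
			BUG();						\
		}							\
	} while (0)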
@@ -1782,7 +1862,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, | |||
1782 | void *base = (void *)ULONG_MAX; | 1862 | void *base = (void *)ULONG_MAX; |
1783 | void **areas = NULL; | 1863 | void **areas = NULL; |
1784 | struct pcpu_alloc_info *ai; | 1864 | struct pcpu_alloc_info *ai; |
1785 | size_t size_sum, areas_size; | 1865 | size_t size_sum, areas_size, max_distance; |
1786 | int group, i, rc; | 1866 | int group, i, rc; |
1787 | 1867 | ||
1788 | ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, | 1868 | ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, |
@@ -1832,8 +1912,25 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, | |||
1832 | } | 1912 | } |
1833 | 1913 | ||
1834 | /* base address is now known, determine group base offsets */ | 1914 | /* base address is now known, determine group base offsets */ |
1835 | for (group = 0; group < ai->nr_groups; group++) | 1915 | max_distance = 0; |
1916 | for (group = 0; group < ai->nr_groups; group++) { | ||
1836 | ai->groups[group].base_offset = areas[group] - base; | 1917 | ai->groups[group].base_offset = areas[group] - base; |
1918 | max_distance = max_t(size_t, max_distance, | ||
1919 | ai->groups[group].base_offset); | ||
1920 | } | ||
1921 | max_distance += ai->unit_size; | ||
1922 | |||
1923 | /* warn if maximum distance is further than 75% of vmalloc space */ | ||
1924 | if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { | ||
1925 | pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " | ||
1926 | "space 0x%lx\n", | ||
1927 | max_distance, VMALLOC_END - VMALLOC_START); | ||
1928 | #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK | ||
1929 | /* and fail if we have fallback */ | ||
1930 | rc = -EINVAL; | ||
1931 | goto out_free; | ||
1932 | #endif | ||
1933 | } | ||
1837 | 1934 | ||
1838 | pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", | 1935 | pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", |
1839 | PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, | 1936 | PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, |
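The max_distance check addresses sparse NUMA layouts: the embedded first chunk must keep every unit addressable inside the vmalloc window, since dynamic per-cpu chunks are later laid out congruently in vmalloc space. A worked instance with hypothetical numbers:

	/* two NUMA groups whose bootmem areas land 1 GiB apart */
	size_t max_distance = 0x40000000UL + 0x10000;	/* highest offset + 64 KiB unit */
	unsigned long vmalloc_span = 128UL << 20;	/* typical 32-bit vmalloc area */

	if (max_distance > vmalloc_span * 3 / 4)	/* ~1 GiB > 96 MiB */
		rc = -EINVAL;	/* reject; use the page-based first chunk */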
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | ||
@@ -242,8 +242,8 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
242 | } | 242 | } |
243 | 243 | ||
244 | /* | 244 | /* |
245 | * At what user virtual address is page expected in vma? checking that the | 245 | * At what user virtual address is page expected in vma? |
246 | page matches the vma: currently only used on anon pages, by unuse_vma; | 246 | Checking that the page matches the vma. |
247 | */ | 247 | */ |
248 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 248 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
249 | { | 249 | { |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 4de7f02f820b..9c590eef7912 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -1151,8 +1151,7 @@ static int try_to_unuse(unsigned int type) | |||
1151 | } else | 1151 | } else |
1152 | retval = unuse_mm(mm, entry, page); | 1152 | retval = unuse_mm(mm, entry, page); |
1153 | 1153 | ||
1154 | if (set_start_mm && | 1154 | if (set_start_mm && *swap_map < swcount) { |
1155 | swap_count(*swap_map) < swcount) { | ||
1156 | mmput(new_start_mm); | 1155 | mmput(new_start_mm); |
1157 | atomic_inc(&mm->mm_users); | 1156 | atomic_inc(&mm->mm_users); |
1158 | new_start_mm = mm; | 1157 | new_start_mm = mm; |
@@ -1974,12 +1973,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1974 | goto bad_swap; | 1973 | goto bad_swap; |
1975 | } | 1974 | } |
1976 | 1975 | ||
1977 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | 1976 | if (p->bdev) { |
1978 | p->flags |= SWP_SOLIDSTATE; | 1977 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
1979 | p->cluster_next = 1 + (random32() % p->highest_bit); | 1978 | p->flags |= SWP_SOLIDSTATE; |
1979 | p->cluster_next = 1 + (random32() % p->highest_bit); | ||
1980 | } | ||
1981 | if (discard_swap(p) == 0) | ||
1982 | p->flags |= SWP_DISCARDABLE; | ||
1980 | } | 1983 | } |
1981 | if (discard_swap(p) == 0) | ||
1982 | p->flags |= SWP_DISCARDABLE; | ||
1983 | 1984 | ||
1984 | mutex_lock(&swapon_mutex); | 1985 | mutex_lock(&swapon_mutex); |
1985 | spin_lock(&swap_lock); | 1986 | spin_lock(&swap_lock); |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 69511e663234..0f551a4a44cd 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/highmem.h> | 14 | #include <linux/highmem.h> |
15 | #include <linux/sched.h> | ||
15 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
16 | #include <linux/spinlock.h> | 17 | #include <linux/spinlock.h> |
17 | #include <linux/interrupt.h> | 18 | #include <linux/interrupt.h> |
@@ -25,10 +26,10 @@ | |||
25 | #include <linux/rcupdate.h> | 26 | #include <linux/rcupdate.h> |
26 | #include <linux/pfn.h> | 27 | #include <linux/pfn.h> |
27 | #include <linux/kmemleak.h> | 28 | #include <linux/kmemleak.h> |
28 | #include <linux/highmem.h> | ||
29 | #include <asm/atomic.h> | 29 | #include <asm/atomic.h> |
30 | #include <asm/uaccess.h> | 30 | #include <asm/uaccess.h> |
31 | #include <asm/tlbflush.h> | 31 | #include <asm/tlbflush.h> |
32 | #include <asm/shmparam.h> | ||
32 | 33 | ||
33 | 34 | ||
34 | /*** Page table manipulation functions ***/ | 35 | /*** Page table manipulation functions ***/ |
@@ -1156,12 +1157,11 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | |||
1156 | } | 1157 | } |
1157 | 1158 | ||
1158 | static struct vm_struct *__get_vm_area_node(unsigned long size, | 1159 | static struct vm_struct *__get_vm_area_node(unsigned long size, |
1159 | unsigned long flags, unsigned long start, unsigned long end, | 1160 | unsigned long align, unsigned long flags, unsigned long start, |
1160 | int node, gfp_t gfp_mask, void *caller) | 1161 | unsigned long end, int node, gfp_t gfp_mask, void *caller) |
1161 | { | 1162 | { |
1162 | struct vmap_area *va; | 1163 |
1163 | struct vm_struct *area; | 1164 | struct vm_struct *area; |
1164 | unsigned long align = 1; | ||
1165 | 1165 | ||
1166 | BUG_ON(in_interrupt()); | 1166 | BUG_ON(in_interrupt()); |
1167 | if (flags & VM_IOREMAP) { | 1167 | if (flags & VM_IOREMAP) { |
@@ -1201,7 +1201,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
1201 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | 1201 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, |
1202 | unsigned long start, unsigned long end) | 1202 | unsigned long start, unsigned long end) |
1203 | { | 1203 | { |
1204 | return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, | 1204 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, |
1205 | __builtin_return_address(0)); | 1205 | __builtin_return_address(0)); |
1206 | } | 1206 | } |
1207 | EXPORT_SYMBOL_GPL(__get_vm_area); | 1207 | EXPORT_SYMBOL_GPL(__get_vm_area); |
@@ -1210,7 +1210,7 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, | |||
1210 | unsigned long start, unsigned long end, | 1210 | unsigned long start, unsigned long end, |
1211 | void *caller) | 1211 | void *caller) |
1212 | { | 1212 | { |
1213 | return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, | 1213 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, |
1214 | caller); | 1214 | caller); |
1215 | } | 1215 | } |
1216 | 1216 | ||
@@ -1225,22 +1225,22 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, | |||
1225 | */ | 1225 | */ |
1226 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) | 1226 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) |
1227 | { | 1227 | { |
1228 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, | 1228 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
1229 | -1, GFP_KERNEL, __builtin_return_address(0)); | 1229 | -1, GFP_KERNEL, __builtin_return_address(0)); |
1230 | } | 1230 | } |
1231 | 1231 | ||
1232 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, | 1232 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, |
1233 | void *caller) | 1233 | void *caller) |
1234 | { | 1234 | { |
1235 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, | 1235 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
1236 | -1, GFP_KERNEL, caller); | 1236 | -1, GFP_KERNEL, caller); |
1237 | } | 1237 | } |
1238 | 1238 | ||
1239 | struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, | 1239 | struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, |
1240 | int node, gfp_t gfp_mask) | 1240 | int node, gfp_t gfp_mask) |
1241 | { | 1241 | { |
1242 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, | 1242 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
1243 | gfp_mask, __builtin_return_address(0)); | 1243 | node, gfp_mask, __builtin_return_address(0)); |
1244 | } | 1244 | } |
1245 | 1245 | ||
1246 | static struct vm_struct *find_vm_area(const void *addr) | 1246 | static struct vm_struct *find_vm_area(const void *addr) |
@@ -1403,7 +1403,8 @@ void *vmap(struct page **pages, unsigned int count, | |||
1403 | } | 1403 | } |
1404 | EXPORT_SYMBOL(vmap); | 1404 | EXPORT_SYMBOL(vmap); |
1405 | 1405 | ||
1406 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | 1406 | static void *__vmalloc_node(unsigned long size, unsigned long align, |
1407 | gfp_t gfp_mask, pgprot_t prot, | ||
1407 | int node, void *caller); | 1408 | int node, void *caller); |
1408 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 1409 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
1409 | pgprot_t prot, int node, void *caller) | 1410 | pgprot_t prot, int node, void *caller) |
@@ -1417,7 +1418,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1417 | area->nr_pages = nr_pages; | 1418 | area->nr_pages = nr_pages; |
1418 | /* Please note that the recursion is strictly bounded. */ | 1419 | /* Please note that the recursion is strictly bounded. */ |
1419 | if (array_size > PAGE_SIZE) { | 1420 | if (array_size > PAGE_SIZE) { |
1420 | pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, | 1421 | pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO, |
1421 | PAGE_KERNEL, node, caller); | 1422 | PAGE_KERNEL, node, caller); |
1422 | area->flags |= VM_VPAGES; | 1423 | area->flags |= VM_VPAGES; |
1423 | } else { | 1424 | } else { |
@@ -1476,6 +1477,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
1476 | /** | 1477 | /** |
1477 | * __vmalloc_node - allocate virtually contiguous memory | 1478 | * __vmalloc_node - allocate virtually contiguous memory |
1478 | * @size: allocation size | 1479 | * @size: allocation size |
1480 | * @align: desired alignment | ||
1479 | * @gfp_mask: flags for the page level allocator | 1481 | * @gfp_mask: flags for the page level allocator |
1480 | * @prot: protection mask for the allocated pages | 1482 | * @prot: protection mask for the allocated pages |
1481 | * @node: node to use for allocation or -1 | 1483 | * @node: node to use for allocation or -1 |
@@ -1485,8 +1487,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
1485 | * allocator with @gfp_mask flags. Map them into contiguous | 1487 | * allocator with @gfp_mask flags. Map them into contiguous |
1486 | * kernel virtual space, using a pagetable protection of @prot. | 1488 | * kernel virtual space, using a pagetable protection of @prot. |
1487 | */ | 1489 | */ |
1488 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | 1490 | static void *__vmalloc_node(unsigned long size, unsigned long align, |
1489 | int node, void *caller) | 1491 | gfp_t gfp_mask, pgprot_t prot, |
1492 | int node, void *caller) | ||
1490 | { | 1493 | { |
1491 | struct vm_struct *area; | 1494 | struct vm_struct *area; |
1492 | void *addr; | 1495 | void *addr; |
@@ -1496,8 +1499,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | |||
1496 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) | 1499 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) |
1497 | return NULL; | 1500 | return NULL; |
1498 | 1501 | ||
1499 | area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, | 1502 | area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, |
1500 | node, gfp_mask, caller); | 1503 | VMALLOC_END, node, gfp_mask, caller); |
1501 | 1504 | ||
1502 | if (!area) | 1505 | if (!area) |
1503 | return NULL; | 1506 | return NULL; |
@@ -1516,7 +1519,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | |||
1516 | 1519 | ||
1517 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 1520 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
1518 | { | 1521 | { |
1519 | return __vmalloc_node(size, gfp_mask, prot, -1, | 1522 | return __vmalloc_node(size, 1, gfp_mask, prot, -1, |
1520 | __builtin_return_address(0)); | 1523 | __builtin_return_address(0)); |
1521 | } | 1524 | } |
1522 | EXPORT_SYMBOL(__vmalloc); | 1525 | EXPORT_SYMBOL(__vmalloc); |
@@ -1532,7 +1535,7 @@ EXPORT_SYMBOL(__vmalloc); | |||
1532 | */ | 1535 | */ |
1533 | void *vmalloc(unsigned long size) | 1536 | void *vmalloc(unsigned long size) |
1534 | { | 1537 | { |
1535 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, | 1538 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, |
1536 | -1, __builtin_return_address(0)); | 1539 | -1, __builtin_return_address(0)); |
1537 | } | 1540 | } |
1538 | EXPORT_SYMBOL(vmalloc); | 1541 | EXPORT_SYMBOL(vmalloc); |
@@ -1549,7 +1552,8 @@ void *vmalloc_user(unsigned long size) | |||
1549 | struct vm_struct *area; | 1552 | struct vm_struct *area; |
1550 | void *ret; | 1553 | void *ret; |
1551 | 1554 | ||
1552 | ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | 1555 | ret = __vmalloc_node(size, SHMLBA, |
1556 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | ||
1553 | PAGE_KERNEL, -1, __builtin_return_address(0)); | 1557 | PAGE_KERNEL, -1, __builtin_return_address(0)); |
1554 | if (ret) { | 1558 | if (ret) { |
1555 | area = find_vm_area(ret); | 1559 | area = find_vm_area(ret); |
@@ -1572,7 +1576,7 @@ EXPORT_SYMBOL(vmalloc_user); | |||
1572 | */ | 1576 | */ |
1573 | void *vmalloc_node(unsigned long size, int node) | 1577 | void *vmalloc_node(unsigned long size, int node) |
1574 | { | 1578 | { |
1575 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, | 1579 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, |
1576 | node, __builtin_return_address(0)); | 1580 | node, __builtin_return_address(0)); |
1577 | } | 1581 | } |
1578 | EXPORT_SYMBOL(vmalloc_node); | 1582 | EXPORT_SYMBOL(vmalloc_node); |
@@ -1595,7 +1599,7 @@ EXPORT_SYMBOL(vmalloc_node); | |||
1595 | 1599 | ||
1596 | void *vmalloc_exec(unsigned long size) | 1600 | void *vmalloc_exec(unsigned long size) |
1597 | { | 1601 | { |
1598 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, | 1602 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, |
1599 | -1, __builtin_return_address(0)); | 1603 | -1, __builtin_return_address(0)); |
1600 | } | 1604 | } |
1601 | 1605 | ||
@@ -1616,7 +1620,7 @@ void *vmalloc_exec(unsigned long size) | |||
1616 | */ | 1620 | */ |
1617 | void *vmalloc_32(unsigned long size) | 1621 | void *vmalloc_32(unsigned long size) |
1618 | { | 1622 | { |
1619 | return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, | 1623 | return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, |
1620 | -1, __builtin_return_address(0)); | 1624 | -1, __builtin_return_address(0)); |
1621 | } | 1625 | } |
1622 | EXPORT_SYMBOL(vmalloc_32); | 1626 | EXPORT_SYMBOL(vmalloc_32); |
@@ -1633,7 +1637,7 @@ void *vmalloc_32_user(unsigned long size) | |||
1633 | struct vm_struct *area; | 1637 | struct vm_struct *area; |
1634 | void *ret; | 1638 | void *ret; |
1635 | 1639 | ||
1636 | ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, | 1640 | ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, |
1637 | -1, __builtin_return_address(0)); | 1641 | -1, __builtin_return_address(0)); |
1638 | if (ret) { | 1642 | if (ret) { |
1639 | area = find_vm_area(ret); | 1643 | area = find_vm_area(ret); |
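The thread running through the vmalloc.c hunks is a new align parameter on __get_vm_area_node()/__vmalloc_node(): every existing caller passes 1 (no constraint), while vmalloc_user() asks for SHMLBA so that buffers later remapped to userspace start on a cache-color boundary and cannot alias on virtually indexed caches (hence the new <asm/shmparam.h> include). A sketch of the property this buys:

	void *buf = vmalloc_user(16 * PAGE_SIZE);

	if (buf) {
		/* the area start is now SHMLBA-aligned */
		BUG_ON((unsigned long)buf & (SHMLBA - 1));
		vfree(buf);
	}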
diff --git a/mm/vmscan.c b/mm/vmscan.c index 64e438898832..777af57fd8c8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -544,6 +544,16 @@ redo: | |||
544 | */ | 544 | */ |
545 | lru = LRU_UNEVICTABLE; | 545 | lru = LRU_UNEVICTABLE; |
546 | add_page_to_unevictable_list(page); | 546 | add_page_to_unevictable_list(page); |
547 | /* | ||
548 | * When racing with an mlock clearing (page is | ||
549 | * unlocked), make sure that if the other thread does | ||
550 | * not observe our setting of PG_lru and fails | ||
551 | * isolation, we see PG_mlocked cleared below and move | ||
552 | * the page back to the evictable list. | ||
553 | * | ||
554 | * The other side is TestClearPageMlocked(). | ||
555 | */ | ||
556 | smp_mb(); | ||
547 | } | 557 | } |
548 | 558 | ||
549 | /* | 559 | /* |
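The smp_mb() pairs with the full barrier implied by TestClearPageMlocked() on the munlock side (value-returning atomic RMW operations imply smp_mb() on both sides in the kernel memory model). The usual store-buffering argument, sketched:

	/*
	 *  putback_lru_page()              munlock path
	 *  ------------------              ------------
	 *  SetPageLRU(page);               TestClearPageMlocked(page);
	 *  smp_mb();                       (barrier implied by the RMW)
	 *  if (!PageMlocked(page))         if (PageLRU(page))
	 *          move to evictable;              isolate and rescue;
	 */

The barriers guarantee that at least one side observes the other's store, so a page can never be stranded unevictable with PG_mlocked already clear.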
@@ -1088,7 +1098,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1088 | int lumpy_reclaim = 0; | 1098 | int lumpy_reclaim = 0; |
1089 | 1099 | ||
1090 | while (unlikely(too_many_isolated(zone, file, sc))) { | 1100 | while (unlikely(too_many_isolated(zone, file, sc))) { |
1091 | congestion_wait(WRITE, HZ/10); | 1101 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1092 | 1102 | ||
1093 | /* We are about to die and free our memory. Return now. */ | 1103 | /* We are about to die and free our memory. Return now. */ |
1094 | if (fatal_signal_pending(current)) | 1104 | if (fatal_signal_pending(current)) |
@@ -1356,7 +1366,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1356 | * IO, plus JVM can create lots of anon VM_EXEC pages, | 1366 | * IO, plus JVM can create lots of anon VM_EXEC pages, |
1357 | * so we ignore them here. | 1367 | * so we ignore them here. |
1358 | */ | 1368 | */ |
1359 | if ((vm_flags & VM_EXEC) && !PageAnon(page)) { | 1369 | if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { |
1360 | list_add(&page->lru, &l_active); | 1370 | list_add(&page->lru, &l_active); |
1361 | continue; | 1371 | continue; |
1362 | } | 1372 | } |
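The last hunk narrows the VM_EXEC protection from "not anon" to "file cache only": MAP_SHARED|MAP_ANONYMOUS (shmem) pages are !PageAnon yet swap-backed, and an application can create huge numbers of them, so only genuinely file-backed executable pages deserve the active-list shelter. At this point in the tree, page_is_file_cache() reduces to a swap-backed test, roughly:

	/* sketch of the mm_inline.h helper of the era */
	static inline int page_is_file_cache(struct page *page)
	{
		return !PageSwapBacked(page);
	}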