10 files changed, 100 insertions, 62 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 57963c6063d1..fd3386242cf0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -67,7 +67,7 @@ config DISCONTIGMEM
 config SPARSEMEM
        def_bool y
-        depends on SPARSEMEM_MANUAL
+        depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
 config FLATMEM
        def_bool y
@@ -129,7 +129,7 @@ config MEMORY_HOTPLUG
        bool "Allow for memory hot-add"
        depends on SPARSEMEM || X86_64_ACPI_NUMA
        depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG
-        depends on (IA64 || X86 || PPC64 || SUPERH || S390)
+        depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
 comment "Memory hotplug is currently incompatible with Software Suspend"
        depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 5a37e2055717..1065b715ef64 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -610,6 +610,21 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
                kthread_stop(wb->task);
 }
+/*
+ * This bdi is going away now; make sure that no super_blocks point to it.
+ *
+ * @bdi: the backing_dev_info that is being torn down.
+ *
+ * Walks the super_blocks list with sb_lock held for the whole traversal
+ * and resets s_bdi to NULL on every super_block whose s_bdi still equals
+ * @bdi.  Super_blocks that reference a different bdi are left untouched.
+ * Returns nothing.
+ *
+ * In this patch the only caller is bdi_destroy(), which invokes it
+ * immediately before bdi_unregister().
+ */
+static void bdi_prune_sb(struct backing_dev_info *bdi)
+{
+        struct super_block *sb;
+        spin_lock(&sb_lock);
+        list_for_each_entry(sb, &super_blocks, s_list) {
+                if (sb->s_bdi == bdi)
+                        sb->s_bdi = NULL;
+        }
+        spin_unlock(&sb_lock);
+}
 void bdi_unregister(struct backing_dev_info *bdi)
 {
        if (bdi->dev) {
@@ -682,6 +697,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
                spin_unlock(&inode_lock);
        }
+        bdi_prune_sb(bdi);
        bdi_unregister(bdi);
        for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 729d4b15b645..dacc64183874 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -35,6 +35,7 @@
 #include <linux/mm.h>
 #include <linux/page-flags.h>
 #include <linux/sched.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
@@ -370,9 +371,6 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
        int ret = FAILED;
        struct address_space *mapping;
-        if (!isolate_lru_page(p))
-                page_cache_release(p);
        /*
         * For anonymous pages we're done the only reference left
         * should be the one m_f() holds.
@@ -498,30 +496,18 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
 */
 static int me_swapcache_dirty(struct page *p, unsigned long pfn)
 {
-        int ret = FAILED;
        ClearPageDirty(p);
        /* Trigger EIO in shmem: */
        ClearPageUptodate(p);
-        if (!isolate_lru_page(p)) {
+        return DELAYED;
-                page_cache_release(p);
-                ret = DELAYED;
-        }
-        return ret;
 }
 static int me_swapcache_clean(struct page *p, unsigned long pfn)
 {
-        int ret = FAILED;
-        if (!isolate_lru_page(p)) {
-                page_cache_release(p);
-                ret = RECOVERED;
-        }
        delete_from_swap_cache(p);
-        return ret;
+        return RECOVERED;
 }
 /*
@@ -611,8 +597,6 @@ static struct page_state {
        { 0,            0,              "unknown page state",   me_unknown },
 };
-#undef lru
 static void action_result(unsigned long pfn, char *msg, int result)
 {
        struct page *page = NULL;
@@ -629,13 +613,16 @@ static int page_action(struct page_state *ps, struct page *p,
                        unsigned long pfn, int ref)
 {
        int result;
+        int count;
        result = ps->action(p, pfn);
        action_result(pfn, ps->msg, result);
-        if (page_count(p) != 1 + ref)
+        count = page_count(p) - 1 - ref;
+        if (count != 0)
                printk(KERN_ERR
                       "MCE %#lx: %s page still referenced by %d users\n",
-                       pfn, ps->msg, page_count(p) - 1);
+                       pfn, ps->msg, count);
        /* Could do more checks here if page looks ok */
        /*
@@ -661,12 +648,9 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
        int i;
        int kill = 1;
-        if (PageReserved(p) || PageCompound(p) || PageSlab(p))
+        if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
                return;
-        if (!PageLRU(p))
-                lru_add_drain_all();
        /*
         * This check implies we don't kill processes if their pages
         * are in the swap cache early. Those are always late kills.
@@ -738,6 +722,7 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
 int __memory_failure(unsigned long pfn, int trapno, int ref)
 {
+        unsigned long lru_flag;
        struct page_state *ps;
        struct page *p;
        int res;
@@ -775,6 +760,24 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
        }
        /*
+         * We ignore non-LRU pages for good reasons.
+         * - PG_locked is only well defined for LRU pages and a few others
+         * - to avoid races with __set_page_locked()
+         * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
+         * The check (unnecessarily) ignores LRU pages being isolated and
+         * walked by the page reclaim code, however that's not a big loss.
+         */
+        if (!PageLRU(p))
+                lru_add_drain_all();
+        lru_flag = p->flags & lru;
+        if (isolate_lru_page(p)) {
+                action_result(pfn, "non LRU", IGNORED);
+                put_page(p);
+                return -EBUSY;
+        }
+        page_cache_release(p);
+        /*
         * Lock the page and wait for writeback to finish.
         * It's very difficult to mess with pages currently under IO
         * and in many cases impossible, so we just avoid it here.
@@ -790,7 +793,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
        /*
         * Torn down by someone else?
         */
-        if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+        if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) {
                action_result(pfn, "already truncated LRU", IGNORED);
                res = 0;
                goto out;
@@ -798,7 +801,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
        res = -EBUSY;
        for (ps = error_states;; ps++) {
-                if ((p->flags & ps->mask) == ps->res) {
+                if (((p->flags | lru_flag)& ps->mask) == ps->res) {
                        res = page_action(ps, p, pfn, ref);
                        break;
                }
diff --git a/mm/memory.c b/mm/memory.c
index 7e91b5f9f690..6ab19dd4a199 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -641,6 +641,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
                unsigned long addr, unsigned long end)
 {
+        pte_t *orig_src_pte, *orig_dst_pte;
        pte_t *src_pte, *dst_pte;
        spinlock_t *src_ptl, *dst_ptl;
        int progress = 0;
@@ -654,6 +655,8 @@ again:
        src_pte = pte_offset_map_nested(src_pmd, addr);
        src_ptl = pte_lockptr(src_mm, src_pmd);
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+        orig_src_pte = src_pte;
+        orig_dst_pte = dst_pte;
        arch_enter_lazy_mmu_mode();
        do {
@@ -677,9 +680,9 @@ again:
        arch_leave_lazy_mmu_mode();
        spin_unlock(src_ptl);
-        pte_unmap_nested(src_pte - 1);
+        pte_unmap_nested(orig_src_pte);
        add_mm_rss(dst_mm, rss[0], rss[1]);
-        pte_unmap_unlock(dst_pte - 1, dst_ptl);
+        pte_unmap_unlock(orig_dst_pte, dst_ptl);
        cond_resched();
        if (addr != end)
                goto again;
@@ -1820,10 +1823,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
        token = pmd_pgtable(*pmd);
        do {
-                err = fn(pte, token, addr, data);
+                err = fn(pte++, token, addr, data);
                if (err)
                        break;
-        } while (pte++, addr += PAGE_SIZE, addr != end);
+        } while (addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();
@@ -2539,7 +2542,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        } else if (PageHWPoison(page)) {
                ret = VM_FAULT_HWPOISON;
                delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
-                goto out;
+                goto out_release;
        }
        lock_page(page);
@@ -2611,6 +2614,7 @@ out_nomap:
        pte_unmap_unlock(page_table, ptl);
 out_page:
        unlock_page(page);
+out_release:
        page_cache_release(page);
        return ret;
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7dd9d9f80694..4545d5944243 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1024,7 +1024,7 @@ static long do_mbind(unsigned long start, unsigned long len,
                err = migrate_prep();
                if (err)
-                        return err;
+                        goto mpol_out;
        }
        {
                NODEMASK_SCRATCH(scratch);
@@ -1039,10 +1039,9 @@ static long do_mbind(unsigned long start, unsigned long len,
                        err = -ENOMEM;
                NODEMASK_SCRATCH_FREE(scratch);
        }
-        if (err) {
+        if (err)
-                mpol_put(new);
+                goto mpol_out;
-                return err;
-        }
        vma = check_range(mm, start, end, nmask,
                          flags | MPOL_MF_INVERT, &pagelist);
@@ -1058,9 +1057,11 @@ static long do_mbind(unsigned long start, unsigned long len,
                if (!err && nr_failed && (flags & MPOL_MF_STRICT))
                        err = -EIO;
-        }
+        } else
+                putback_lru_pages(&pagelist);
        up_write(&mm->mmap_sem);
+ mpol_out:
        mpol_put(new);
        return err;
 }
diff --git a/mm/nommu.c b/mm/nommu.c
index 5189b5aed8c0..9876fa0c3ad3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1362,9 +1362,11 @@ share:
 error_just_free:
        up_write(&nommu_region_sem);
 error:
-        fput(region->vm_file);
+        if (region->vm_file)
+                fput(region->vm_file);
        kmem_cache_free(vm_region_jar, region);
-        fput(vma->vm_file);
+        if (vma->vm_file)
+                fput(vma->vm_file);
        if (vma->vm_flags & VM_EXECUTABLE)
                removed_exe_file_vma(vma->vm_mm);
        kmem_cache_free(vm_area_cachep, vma);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bf720550b44d..cdcedf661616 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2183,7 +2183,7 @@ void show_free_areas(void)
        printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
                " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
                " unevictable:%lu"
-                " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n"
+                " dirty:%lu writeback:%lu unstable:%lu\n"
                " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
                " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
                global_page_state(NR_ACTIVE_ANON),
@@ -2196,7 +2196,6 @@ void show_free_areas(void)
                global_page_state(NR_FILE_DIRTY),
                global_page_state(NR_WRITEBACK),
                global_page_state(NR_UNSTABLE_NFS),
-                nr_blockdev_pages(),
                global_page_state(NR_FREE_PAGES),
                global_page_state(NR_SLAB_RECLAIMABLE),
                global_page_state(NR_SLAB_UNRECLAIMABLE),
diff --git a/mm/percpu.c b/mm/percpu.c
index 6af78c1ee704..d90797160c2a 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -153,7 +153,10 @@ static int pcpu_reserved_chunk_limit;
 *
 * During allocation, pcpu_alloc_mutex is kept locked all the time and
 * pcpu_lock is grabbed and released as necessary.  All actual memory
- * allocations are done using GFP_KERNEL with pcpu_lock released.
+ * allocations are done using GFP_KERNEL with pcpu_lock released.  In
+ * general, percpu memory can't be allocated with irqs off, but
+ * irqsave/restore are still used in the alloc path so that the allocator
+ * can be used from the early init path - sched_init() specifically.
 *
 * Free path accesses and alters only the index data structures, so it
 * can be safely called from atomic context.  When memory needs to be
@@ -366,7 +369,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 * RETURNS:
 * 0 if noop, 1 if successfully extended, -errno on failure.
 */
-static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk, unsigned long *flags)
 {
        int new_alloc;
        int *new;
@@ -376,7 +379,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
        if (chunk->map_alloc >= chunk->map_used + 2)
                return 0;
-        spin_unlock_irq(&pcpu_lock);
+        spin_unlock_irqrestore(&pcpu_lock, *flags);
        new_alloc = PCPU_DFL_MAP_ALLOC;
        while (new_alloc < chunk->map_used + 2)
@@ -384,7 +387,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
        new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
        if (!new) {
-                spin_lock_irq(&pcpu_lock);
+                spin_lock_irqsave(&pcpu_lock, *flags);
                return -ENOMEM;
        }
@@ -393,7 +396,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
         * could have happened inbetween, so map_used couldn't have
         * grown.
         */
-        spin_lock_irq(&pcpu_lock);
+        spin_lock_irqsave(&pcpu_lock, *flags);
        BUG_ON(new_alloc < chunk->map_used + 2);
        size = chunk->map_alloc * sizeof(chunk->map[0]);
@@ -1047,6 +1050,7 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
        struct pcpu_chunk *chunk;
        const char *err;
        int slot, off;
+        unsigned long flags;
        if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
                WARN(true, "illegal size (%zu) or align (%zu) for "
@@ -1055,13 +1059,13 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
        }
        mutex_lock(&pcpu_alloc_mutex);
-        spin_lock_irq(&pcpu_lock);
+        spin_lock_irqsave(&pcpu_lock, flags);
        /* serve reserved allocations from the reserved chunk if available */
        if (reserved && pcpu_reserved_chunk) {
                chunk = pcpu_reserved_chunk;
                if (size > chunk->contig_hint ||
-                    pcpu_extend_area_map(chunk) < 0) {
+                    pcpu_extend_area_map(chunk, &flags) < 0) {
                        err = "failed to extend area map of reserved chunk";
                        goto fail_unlock;
                }
@@ -1079,7 +1083,7 @@ restart:
                        if (size > chunk->contig_hint)
                                continue;
-                        switch (pcpu_extend_area_map(chunk)) {
+                        switch (pcpu_extend_area_map(chunk, &flags)) {
                        case 0:
                                break;
                        case 1:
@@ -1096,7 +1100,7 @@ restart:
        }
        /* hmmm... no space left, create a new chunk */
-        spin_unlock_irq(&pcpu_lock);
+        spin_unlock_irqrestore(&pcpu_lock, flags);
        chunk = alloc_pcpu_chunk();
        if (!chunk) {
@@ -1104,16 +1108,16 @@ restart:
                goto fail_unlock_mutex;
        }
-        spin_lock_irq(&pcpu_lock);
+        spin_lock_irqsave(&pcpu_lock, flags);
        pcpu_chunk_relocate(chunk, -1);
        goto restart;
 area_found:
-        spin_unlock_irq(&pcpu_lock);
+        spin_unlock_irqrestore(&pcpu_lock, flags);
        /* populate, map and clear the area */
        if (pcpu_populate_chunk(chunk, off, size)) {
-                spin_lock_irq(&pcpu_lock);
+                spin_lock_irqsave(&pcpu_lock, flags);
                pcpu_free_area(chunk, off);
                err = "failed to populate";
                goto fail_unlock;
@@ -1125,7 +1129,7 @@ area_found:
        return __addr_to_pcpu_ptr(chunk->base_addr + off);
 fail_unlock:
-        spin_unlock_irq(&pcpu_lock);
+        spin_unlock_irqrestore(&pcpu_lock, flags);
 fail_unlock_mutex:
        mutex_unlock(&pcpu_alloc_mutex);
        if (warn_limit) {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a1bc6b9af9a2..9c590eef7912 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1151,8 +1151,7 @@ static int try_to_unuse(unsigned int type)
                                } else
                                        retval = unuse_mm(mm, entry, page);
-                                if (set_start_mm &&
+                                if (set_start_mm && *swap_map < swcount) {
-                                    swap_count(*swap_map) < swcount) {
                                        mmput(new_start_mm);
                                        atomic_inc(&mm->mm_users);
                                        new_start_mm = mm;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 64e438898832..777af57fd8c8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -544,6 +544,16 @@ redo:
                 */
                lru = LRU_UNEVICTABLE;
                add_page_to_unevictable_list(page);
+                /*
+                 * When racing with an mlock clearing (page is
+                 * unlocked), make sure that if the other thread does
+                 * not observe our setting of PG_lru and fails
+                 * isolation, we see PG_mlocked cleared below and move
+                 * the page back to the evictable list.
+                 *
+                 * The other side is TestClearPageMlocked().
+                 */
+                smp_mb();
        }
        /*
@@ -1088,7 +1098,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
        int lumpy_reclaim = 0;
        while (unlikely(too_many_isolated(zone, file, sc))) {
-                congestion_wait(WRITE, HZ/10);
+                congestion_wait(BLK_RW_ASYNC, HZ/10);
                /* We are about to die and free our memory. Return now. */
                if (fatal_signal_pending(current))
@@ -1356,7 +1366,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                         * IO, plus JVM can create lots of anon VM_EXEC pages,
                         * so we ignore them here.
                         */
-                        if ((vm_flags & VM_EXEC) && !PageAnon(page)) {
+                        if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
                                list_add(&page->lru, &l_active);
                                continue;
                        }