Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |  2
-rw-r--r--  mm/huge_memory.c      | 78
-rw-r--r--  mm/kmemleak-test.c    |  6
-rw-r--r--  mm/kmemleak.c         | 13
-rw-r--r--  mm/memblock.c         |  2
-rw-r--r--  mm/memcontrol.c       | 98
-rw-r--r--  mm/memory-failure.c   | 94
-rw-r--r--  mm/memory.c           | 34
-rw-r--r--  mm/mempolicy.c        | 16
-rw-r--r--  mm/migrate.c          | 15
-rw-r--r--  mm/mlock.c            |  7
-rw-r--r--  mm/mremap.c           |  4
-rw-r--r--  mm/page_alloc.c       | 23
-rw-r--r--  mm/pgtable-generic.c  |  1
-rw-r--r--  mm/rmap.c             | 54
-rw-r--r--  mm/shmem.c            |  4
-rw-r--r--  mm/swapfile.c         |  2
-rw-r--r--  mm/truncate.c         |  2
-rw-r--r--  mm/vmscan.c           | 39
19 files changed, 323 insertions(+), 171 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 3ad483bdf505..e9c0c61f2ddd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS
179config COMPACTION 179config COMPACTION
180 bool "Allow for memory compaction" 180 bool "Allow for memory compaction"
181 select MIGRATION 181 select MIGRATION
182 depends on EXPERIMENTAL && HUGETLB_PAGE && MMU 182 depends on MMU
183 help 183 help
184 Allows the compaction of memory for the allocation of huge pages. 184 Allows the compaction of memory for the allocation of huge pages.
185 185
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e187454d82f6..113e35c47502 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -650,10 +650,10 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag)
650 650
651static inline struct page *alloc_hugepage_vma(int defrag, 651static inline struct page *alloc_hugepage_vma(int defrag,
652 struct vm_area_struct *vma, 652 struct vm_area_struct *vma,
653 unsigned long haddr) 653 unsigned long haddr, int nd)
654{ 654{
655 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), 655 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
656 HPAGE_PMD_ORDER, vma, haddr); 656 HPAGE_PMD_ORDER, vma, haddr, nd);
657} 657}
658 658
659#ifndef CONFIG_NUMA 659#ifndef CONFIG_NUMA
@@ -678,7 +678,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
678 if (unlikely(khugepaged_enter(vma))) 678 if (unlikely(khugepaged_enter(vma)))
679 return VM_FAULT_OOM; 679 return VM_FAULT_OOM;
680 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 680 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
681 vma, haddr); 681 vma, haddr, numa_node_id());
682 if (unlikely(!page)) 682 if (unlikely(!page))
683 goto out; 683 goto out;
684 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { 684 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
@@ -799,8 +799,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
799 } 799 }
800 800
801 for (i = 0; i < HPAGE_PMD_NR; i++) { 801 for (i = 0; i < HPAGE_PMD_NR; i++) {
802 pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, 802 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE,
803 vma, address); 803 vma, address, page_to_nid(page));
804 if (unlikely(!pages[i] || 804 if (unlikely(!pages[i] ||
805 mem_cgroup_newpage_charge(pages[i], mm, 805 mem_cgroup_newpage_charge(pages[i], mm,
806 GFP_KERNEL))) { 806 GFP_KERNEL))) {
@@ -902,7 +902,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
902 if (transparent_hugepage_enabled(vma) && 902 if (transparent_hugepage_enabled(vma) &&
903 !transparent_hugepage_debug_cow()) 903 !transparent_hugepage_debug_cow())
904 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 904 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
905 vma, haddr); 905 vma, haddr, numa_node_id());
906 else 906 else
907 new_page = NULL; 907 new_page = NULL;
908 908
@@ -1162,7 +1162,12 @@ static void __split_huge_page_refcount(struct page *page)
1162 /* after clearing PageTail the gup refcount can be released */ 1162 /* after clearing PageTail the gup refcount can be released */
1163 smp_mb(); 1163 smp_mb();
1164 1164
1165 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 1165 /*
 1166 * retain the hwpoison flag of a poisoned tail page:
 1167 * this fixes the wrong process being killed on a KVM guest
 1168 * by memory-failure.
1169 */
1170 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
1166 page_tail->flags |= (page->flags & 1171 page_tail->flags |= (page->flags &
1167 ((1L << PG_referenced) | 1172 ((1L << PG_referenced) |
1168 (1L << PG_swapbacked) | 1173 (1L << PG_swapbacked) |
@@ -1740,7 +1745,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1740static void collapse_huge_page(struct mm_struct *mm, 1745static void collapse_huge_page(struct mm_struct *mm,
1741 unsigned long address, 1746 unsigned long address,
1742 struct page **hpage, 1747 struct page **hpage,
1743 struct vm_area_struct *vma) 1748 struct vm_area_struct *vma,
1749 int node)
1744{ 1750{
1745 pgd_t *pgd; 1751 pgd_t *pgd;
1746 pud_t *pud; 1752 pud_t *pud;
@@ -1756,6 +1762,10 @@ static void collapse_huge_page(struct mm_struct *mm,
1756#ifndef CONFIG_NUMA 1762#ifndef CONFIG_NUMA
1757 VM_BUG_ON(!*hpage); 1763 VM_BUG_ON(!*hpage);
1758 new_page = *hpage; 1764 new_page = *hpage;
1765 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1766 up_read(&mm->mmap_sem);
1767 return;
1768 }
1759#else 1769#else
1760 VM_BUG_ON(*hpage); 1770 VM_BUG_ON(*hpage);
1761 /* 1771 /*
@@ -1768,18 +1778,19 @@ static void collapse_huge_page(struct mm_struct *mm,
1768 * mmap_sem in read mode is good idea also to allow greater 1778 * mmap_sem in read mode is good idea also to allow greater
1769 * scalability. 1779 * scalability.
1770 */ 1780 */
1771 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); 1781 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
1782 node);
1772 if (unlikely(!new_page)) { 1783 if (unlikely(!new_page)) {
1773 up_read(&mm->mmap_sem); 1784 up_read(&mm->mmap_sem);
1774 *hpage = ERR_PTR(-ENOMEM); 1785 *hpage = ERR_PTR(-ENOMEM);
1775 return; 1786 return;
1776 } 1787 }
1777#endif
1778 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1788 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1779 up_read(&mm->mmap_sem); 1789 up_read(&mm->mmap_sem);
1780 put_page(new_page); 1790 put_page(new_page);
1781 return; 1791 return;
1782 } 1792 }
1793#endif
1783 1794
1784 /* after allocating the hugepage upgrade to mmap_sem write mode */ 1795 /* after allocating the hugepage upgrade to mmap_sem write mode */
1785 up_read(&mm->mmap_sem); 1796 up_read(&mm->mmap_sem);
@@ -1806,6 +1817,8 @@ static void collapse_huge_page(struct mm_struct *mm,
1806 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ 1817 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
1807 if (!vma->anon_vma || vma->vm_ops || vma->vm_file) 1818 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
1808 goto out; 1819 goto out;
1820 if (is_vma_temporary_stack(vma))
1821 goto out;
1809 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); 1822 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1810 1823
1811 pgd = pgd_offset(mm, address); 1824 pgd = pgd_offset(mm, address);
@@ -1847,7 +1860,6 @@ static void collapse_huge_page(struct mm_struct *mm,
1847 set_pmd_at(mm, address, pmd, _pmd); 1860 set_pmd_at(mm, address, pmd, _pmd);
1848 spin_unlock(&mm->page_table_lock); 1861 spin_unlock(&mm->page_table_lock);
1849 anon_vma_unlock(vma->anon_vma); 1862 anon_vma_unlock(vma->anon_vma);
1850 mem_cgroup_uncharge_page(new_page);
1851 goto out; 1863 goto out;
1852 } 1864 }
1853 1865
@@ -1893,6 +1905,7 @@ out_up_write:
1893 return; 1905 return;
1894 1906
1895out: 1907out:
1908 mem_cgroup_uncharge_page(new_page);
1896#ifdef CONFIG_NUMA 1909#ifdef CONFIG_NUMA
1897 put_page(new_page); 1910 put_page(new_page);
1898#endif 1911#endif
@@ -1912,6 +1925,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
1912 struct page *page; 1925 struct page *page;
1913 unsigned long _address; 1926 unsigned long _address;
1914 spinlock_t *ptl; 1927 spinlock_t *ptl;
1928 int node = -1;
1915 1929
1916 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1930 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1917 1931
@@ -1942,6 +1956,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
1942 page = vm_normal_page(vma, _address, pteval); 1956 page = vm_normal_page(vma, _address, pteval);
1943 if (unlikely(!page)) 1957 if (unlikely(!page))
1944 goto out_unmap; 1958 goto out_unmap;
1959 /*
 1960 * Choose the node of the first page. This could
1961 * be more sophisticated and look at more pages,
1962 * but isn't for now.
1963 */
1964 if (node == -1)
1965 node = page_to_nid(page);
1945 VM_BUG_ON(PageCompound(page)); 1966 VM_BUG_ON(PageCompound(page));
1946 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 1967 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
1947 goto out_unmap; 1968 goto out_unmap;
@@ -1958,7 +1979,7 @@ out_unmap:
1958 pte_unmap_unlock(pte, ptl); 1979 pte_unmap_unlock(pte, ptl);
1959 if (ret) 1980 if (ret)
1960 /* collapse_huge_page will return with the mmap_sem released */ 1981 /* collapse_huge_page will return with the mmap_sem released */
1961 collapse_huge_page(mm, address, hpage, vma); 1982 collapse_huge_page(mm, address, hpage, vma, node);
1962out: 1983out:
1963 return ret; 1984 return ret;
1964} 1985}
@@ -2027,32 +2048,27 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2027 if ((!(vma->vm_flags & VM_HUGEPAGE) && 2048 if ((!(vma->vm_flags & VM_HUGEPAGE) &&
2028 !khugepaged_always()) || 2049 !khugepaged_always()) ||
2029 (vma->vm_flags & VM_NOHUGEPAGE)) { 2050 (vma->vm_flags & VM_NOHUGEPAGE)) {
2051 skip:
2030 progress++; 2052 progress++;
2031 continue; 2053 continue;
2032 } 2054 }
2033
2034 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ 2055 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
2035 if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { 2056 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
2036 khugepaged_scan.address = vma->vm_end; 2057 goto skip;
2037 progress++; 2058 if (is_vma_temporary_stack(vma))
2038 continue; 2059 goto skip;
2039 } 2060
2040 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); 2061 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
2041 2062
2042 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2063 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2043 hend = vma->vm_end & HPAGE_PMD_MASK; 2064 hend = vma->vm_end & HPAGE_PMD_MASK;
2044 if (hstart >= hend) { 2065 if (hstart >= hend)
2045 progress++; 2066 goto skip;
2046 continue; 2067 if (khugepaged_scan.address > hend)
2047 } 2068 goto skip;
2048 if (khugepaged_scan.address < hstart) 2069 if (khugepaged_scan.address < hstart)
2049 khugepaged_scan.address = hstart; 2070 khugepaged_scan.address = hstart;
2050 if (khugepaged_scan.address > hend) { 2071 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2051 khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
2052 progress++;
2053 continue;
2054 }
2055 BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2056 2072
2057 while (khugepaged_scan.address < hend) { 2073 while (khugepaged_scan.address < hend) {
2058 int ret; 2074 int ret;
@@ -2081,7 +2097,7 @@ breakouterloop:
2081breakouterloop_mmap_sem: 2097breakouterloop_mmap_sem:
2082 2098
2083 spin_lock(&khugepaged_mm_lock); 2099 spin_lock(&khugepaged_mm_lock);
2084 BUG_ON(khugepaged_scan.mm_slot != mm_slot); 2100 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2085 /* 2101 /*
2086 * Release the current mm_slot if this mm is about to die, or 2102 * Release the current mm_slot if this mm is about to die, or
2087 * if we scanned all vmas of this mm. 2103 * if we scanned all vmas of this mm.
@@ -2236,9 +2252,9 @@ static int khugepaged(void *none)
2236 2252
2237 for (;;) { 2253 for (;;) {
2238 mutex_unlock(&khugepaged_mutex); 2254 mutex_unlock(&khugepaged_mutex);
2239 BUG_ON(khugepaged_thread != current); 2255 VM_BUG_ON(khugepaged_thread != current);
2240 khugepaged_loop(); 2256 khugepaged_loop();
2241 BUG_ON(khugepaged_thread != current); 2257 VM_BUG_ON(khugepaged_thread != current);
2242 2258
2243 mutex_lock(&khugepaged_mutex); 2259 mutex_lock(&khugepaged_mutex);
2244 if (!khugepaged_enabled()) 2260 if (!khugepaged_enabled())
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index 177a5169bbde..ff0d9779cec8 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void)
75 * after the module is removed. 75 * after the module is removed.
76 */ 76 */
77 for (i = 0; i < 10; i++) { 77 for (i = 0; i < 10; i++) {
78 elem = kmalloc(sizeof(*elem), GFP_KERNEL); 78 elem = kzalloc(sizeof(*elem), GFP_KERNEL);
79 pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); 79 pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem);
80 if (!elem) 80 if (!elem)
81 return -ENOMEM; 81 return -ENOMEM;
82 memset(elem, 0, sizeof(*elem));
83 INIT_LIST_HEAD(&elem->list); 82 INIT_LIST_HEAD(&elem->list);
84
85 list_add_tail(&elem->list, &test_list); 83 list_add_tail(&elem->list, &test_list);
86 } 84 }
87 85
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index bd9bc214091b..84225f3b7190 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -113,7 +113,9 @@
113#define BYTES_PER_POINTER sizeof(void *) 113#define BYTES_PER_POINTER sizeof(void *)
114 114
115/* GFP bitmask for kmemleak internal allocations */ 115/* GFP bitmask for kmemleak internal allocations */
116#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) 116#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
117 __GFP_NORETRY | __GFP_NOMEMALLOC | \
118 __GFP_NOWARN)
117 119
118/* scanning area inside a memory block */ 120/* scanning area inside a memory block */
119struct kmemleak_scan_area { 121struct kmemleak_scan_area {
@@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
511 struct kmemleak_object *object; 513 struct kmemleak_object *object;
512 struct prio_tree_node *node; 514 struct prio_tree_node *node;
513 515
514 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); 516 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
515 if (!object) { 517 if (!object) {
516 kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); 518 pr_warning("Cannot allocate a kmemleak_object structure\n");
519 kmemleak_disable();
517 return NULL; 520 return NULL;
518 } 521 }
519 522
@@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
734 return; 737 return;
735 } 738 }
736 739
737 area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); 740 area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
738 if (!area) { 741 if (!area) {
739 kmemleak_warn("Cannot allocate a scan area\n"); 742 pr_warning("Cannot allocate a scan area\n");
740 goto out; 743 goto out;
741 } 744 }
742 745
diff --git a/mm/memblock.c b/mm/memblock.c
index bdba245d8afd..4618fda975a0 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -137,8 +137,6 @@ static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
137 137
138 BUG_ON(0 == size); 138 BUG_ON(0 == size);
139 139
140 size = memblock_align_up(size, align);
141
142 /* Pump up max_addr */ 140 /* Pump up max_addr */
143 if (end == MEMBLOCK_ALLOC_ACCESSIBLE) 141 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
144 end = memblock.current_limit; 142 end = memblock.current_limit;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index db76ef726293..da53a252b259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -612,8 +612,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
612 /* pagein of a big page is an event. So, ignore page size */ 612 /* pagein of a big page is an event. So, ignore page size */
613 if (nr_pages > 0) 613 if (nr_pages > 0)
614 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); 614 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
615 else 615 else {
616 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); 616 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
617 nr_pages = -nr_pages; /* for event */
618 }
617 619
618 __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); 620 __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
619 621
@@ -1111,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
1111 return false; 1113 return false;
1112} 1114}
1113 1115
1116/**
1117 * mem_cgroup_check_margin - check if the memory cgroup allows charging
1118 * @mem: memory cgroup to check
1119 * @bytes: the number of bytes the caller intends to charge
1120 *
 1121 * Returns true if @mem can be charged @bytes without exceeding
 1122 * the limit, false otherwise.
1123 */
1124static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
1125{
1126 if (!res_counter_check_margin(&mem->res, bytes))
1127 return false;
1128 if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
1129 return false;
1130 return true;
1131}
1132
1114static unsigned int get_swappiness(struct mem_cgroup *memcg) 1133static unsigned int get_swappiness(struct mem_cgroup *memcg)
1115{ 1134{
1116 struct cgroup *cgrp = memcg->css.cgroup; 1135 struct cgroup *cgrp = memcg->css.cgroup;
@@ -1832,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1832 if (likely(!ret)) 1851 if (likely(!ret))
1833 return CHARGE_OK; 1852 return CHARGE_OK;
1834 1853
1854 res_counter_uncharge(&mem->res, csize);
1835 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 1855 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
1836 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 1856 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1837 } else 1857 } else
1838 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 1858 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
1839 1859 /*
1840 if (csize > PAGE_SIZE) /* change csize and retry */ 1860 * csize can be either a huge page (HPAGE_SIZE), a batch of
1861 * regular pages (CHARGE_SIZE), or a single regular page
1862 * (PAGE_SIZE).
1863 *
1864 * Never reclaim on behalf of optional batching, retry with a
1865 * single page instead.
1866 */
1867 if (csize == CHARGE_SIZE)
1841 return CHARGE_RETRY; 1868 return CHARGE_RETRY;
1842 1869
1843 if (!(gfp_mask & __GFP_WAIT)) 1870 if (!(gfp_mask & __GFP_WAIT))
1844 return CHARGE_WOULDBLOCK; 1871 return CHARGE_WOULDBLOCK;
1845 1872
1846 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 1873 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1847 gfp_mask, flags); 1874 gfp_mask, flags);
1875 if (mem_cgroup_check_margin(mem_over_limit, csize))
1876 return CHARGE_RETRY;
1848 /* 1877 /*
1849 * try_to_free_mem_cgroup_pages() might not give us a full 1878 * Even though the limit is exceeded at this point, reclaim
1850 * picture of reclaim. Some pages are reclaimed and might be 1879 * may have been able to free some pages. Retry the charge
1851 * moved to swap cache or just unmapped from the cgroup. 1880 * before killing the task.
1852 * Check the limit again to see if the reclaim reduced the 1881 *
1853 * current usage of the cgroup before giving up 1882 * Only for regular pages, though: huge pages are rather
1883 * unlikely to succeed so close to the limit, and we fall back
1884 * to regular pages anyway in case of failure.
1854 */ 1885 */
1855 if (ret || mem_cgroup_check_under_limit(mem_over_limit)) 1886 if (csize == PAGE_SIZE && ret)
1856 return CHARGE_RETRY; 1887 return CHARGE_RETRY;
1857 1888
1858 /* 1889 /*
@@ -2144,6 +2175,8 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2144 struct page_cgroup *tail_pc = lookup_page_cgroup(tail); 2175 struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
2145 unsigned long flags; 2176 unsigned long flags;
2146 2177
2178 if (mem_cgroup_disabled())
2179 return;
2147 /* 2180 /*
2148 * We have no races with charge/uncharge but will have races with 2181 * We have no races with charge/uncharge but will have races with
2149 * page state accounting. 2182 * page state accounting.
@@ -2233,7 +2266,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
2233{ 2266{
2234 int ret = -EINVAL; 2267 int ret = -EINVAL;
2235 unsigned long flags; 2268 unsigned long flags;
2236 2269 /*
 2270 * The page is isolated from the LRU, so the collapse function
 2271 * will not handle it. But page splitting can still happen;
 2272 * do this check under compound_page_lock(), which the caller
 2273 * should hold.
2274 */
2237 if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) 2275 if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
2238 return -EBUSY; 2276 return -EBUSY;
2239 2277
@@ -2265,7 +2303,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
2265 struct cgroup *cg = child->css.cgroup; 2303 struct cgroup *cg = child->css.cgroup;
2266 struct cgroup *pcg = cg->parent; 2304 struct cgroup *pcg = cg->parent;
2267 struct mem_cgroup *parent; 2305 struct mem_cgroup *parent;
2268 int charge = PAGE_SIZE; 2306 int page_size = PAGE_SIZE;
2269 unsigned long flags; 2307 unsigned long flags;
2270 int ret; 2308 int ret;
2271 2309
@@ -2278,23 +2316,26 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
2278 goto out; 2316 goto out;
2279 if (isolate_lru_page(page)) 2317 if (isolate_lru_page(page))
2280 goto put; 2318 goto put;
2281 /* The page is isolated from LRU and we have no race with splitting */ 2319
2282 charge = PAGE_SIZE << compound_order(page); 2320 if (PageTransHuge(page))
2321 page_size = HPAGE_SIZE;
2283 2322
2284 parent = mem_cgroup_from_cont(pcg); 2323 parent = mem_cgroup_from_cont(pcg);
2285 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, charge); 2324 ret = __mem_cgroup_try_charge(NULL, gfp_mask,
2325 &parent, false, page_size);
2286 if (ret || !parent) 2326 if (ret || !parent)
2287 goto put_back; 2327 goto put_back;
2288 2328
2289 if (charge > PAGE_SIZE) 2329 if (page_size > PAGE_SIZE)
2290 flags = compound_lock_irqsave(page); 2330 flags = compound_lock_irqsave(page);
2291 2331
2292 ret = mem_cgroup_move_account(pc, child, parent, true, charge); 2332 ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
2293 if (ret) 2333 if (ret)
2294 mem_cgroup_cancel_charge(parent, charge); 2334 mem_cgroup_cancel_charge(parent, page_size);
2295put_back: 2335
2296 if (charge > PAGE_SIZE) 2336 if (page_size > PAGE_SIZE)
2297 compound_unlock_irqrestore(page, flags); 2337 compound_unlock_irqrestore(page, flags);
2338put_back:
2298 putback_lru_page(page); 2339 putback_lru_page(page);
2299put: 2340put:
2300 put_page(page); 2341 put_page(page);
@@ -2312,13 +2353,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2312 gfp_t gfp_mask, enum charge_type ctype) 2353 gfp_t gfp_mask, enum charge_type ctype)
2313{ 2354{
2314 struct mem_cgroup *mem = NULL; 2355 struct mem_cgroup *mem = NULL;
2356 int page_size = PAGE_SIZE;
2315 struct page_cgroup *pc; 2357 struct page_cgroup *pc;
2358 bool oom = true;
2316 int ret; 2359 int ret;
2317 int page_size = PAGE_SIZE;
2318 2360
2319 if (PageTransHuge(page)) { 2361 if (PageTransHuge(page)) {
2320 page_size <<= compound_order(page); 2362 page_size <<= compound_order(page);
2321 VM_BUG_ON(!PageTransHuge(page)); 2363 VM_BUG_ON(!PageTransHuge(page));
2364 /*
2365 * Never OOM-kill a process for a huge page. The
2366 * fault handler will fall back to regular pages.
2367 */
2368 oom = false;
2322 } 2369 }
2323 2370
2324 pc = lookup_page_cgroup(page); 2371 pc = lookup_page_cgroup(page);
@@ -2327,7 +2374,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2327 return 0; 2374 return 0;
2328 prefetchw(pc); 2375 prefetchw(pc);
2329 2376
2330 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); 2377 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
2331 if (ret || !mem) 2378 if (ret || !mem)
2332 return ret; 2379 return ret;
2333 2380
@@ -5013,9 +5060,9 @@ struct cgroup_subsys mem_cgroup_subsys = {
5013static int __init enable_swap_account(char *s) 5060static int __init enable_swap_account(char *s)
5014{ 5061{
5015 /* consider enabled if no parameter or 1 is given */ 5062 /* consider enabled if no parameter or 1 is given */
5016 if (!s || !strcmp(s, "1")) 5063 if (!(*s) || !strcmp(s, "=1"))
5017 really_do_swap_account = 1; 5064 really_do_swap_account = 1;
5018 else if (!strcmp(s, "0")) 5065 else if (!strcmp(s, "=0"))
5019 really_do_swap_account = 0; 5066 really_do_swap_account = 0;
5020 return 1; 5067 return 1;
5021} 5068}
@@ -5023,7 +5070,8 @@ __setup("swapaccount", enable_swap_account);
5023 5070
5024static int __init disable_swap_account(char *s) 5071static int __init disable_swap_account(char *s)
5025{ 5072{
5026 enable_swap_account("0"); 5073 printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
5074 enable_swap_account("=0");
5027 return 1; 5075 return 1;
5028} 5076}
5029__setup("noswapaccount", disable_swap_account); 5077__setup("noswapaccount", disable_swap_account);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 548fbd70f026..0207c2f6f8bd 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -233,8 +233,8 @@ void shake_page(struct page *p, int access)
233 } 233 }
234 234
235 /* 235 /*
236 * Only all shrink_slab here (which would also 236 * Only call shrink_slab here (which would also shrink other caches) if
237 * shrink other caches) if access is not potentially fatal. 237 * access is not potentially fatal.
238 */ 238 */
239 if (access) { 239 if (access) {
240 int nr; 240 int nr;
@@ -386,8 +386,6 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
386 struct task_struct *tsk; 386 struct task_struct *tsk;
387 struct anon_vma *av; 387 struct anon_vma *av;
388 388
389 if (!PageHuge(page) && unlikely(split_huge_page(page)))
390 return;
391 read_lock(&tasklist_lock); 389 read_lock(&tasklist_lock);
392 av = page_lock_anon_vma(page); 390 av = page_lock_anon_vma(page);
393 if (av == NULL) /* Not actually mapped anymore */ 391 if (av == NULL) /* Not actually mapped anymore */
@@ -856,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
856 int ret; 854 int ret;
857 int kill = 1; 855 int kill = 1;
858 struct page *hpage = compound_head(p); 856 struct page *hpage = compound_head(p);
857 struct page *ppage;
859 858
860 if (PageReserved(p) || PageSlab(p)) 859 if (PageReserved(p) || PageSlab(p))
861 return SWAP_SUCCESS; 860 return SWAP_SUCCESS;
@@ -897,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
897 } 896 }
898 897
899 /* 898 /*
 899 * ppage: the poisoned page to work on.
 900 * If p is a regular (4k) page,
 901 * ppage == the real poisoned page;
 902 * else p is hugetlb or THP, and ppage == the head page.
903 */
904 ppage = hpage;
905
906 if (PageTransHuge(hpage)) {
907 /*
908 * Verify that this isn't a hugetlbfs head page, the check for
 909 * PageAnon is just to avoid tripping a split_huge_page
 910 * internal debug check, as split_huge_page refuses to deal with
 911 * anything that isn't an anon page. PageAnon can't go away from
 912 * under us because we hold a refcount on the hpage; without a
 913 * refcount on the hpage, split_huge_page can't be safely called
 914 * in the first place, and having a refcount on the tail isn't
 915 * enough to be safe.
916 */
917 if (!PageHuge(hpage) && PageAnon(hpage)) {
918 if (unlikely(split_huge_page(hpage))) {
919 /*
 920 * FIXME: if splitting the THP fails, it is
 921 * better to stop the following operation rather
 922 * than cause a panic by unmapping. The system might
 923 * survive if the page is freed later.
924 */
925 printk(KERN_INFO
926 "MCE %#lx: failed to split THP\n", pfn);
927
928 BUG_ON(!PageHWPoison(p));
929 return SWAP_FAIL;
930 }
931 /* THP is split, so ppage should be the real poisoned page. */
932 ppage = p;
933 }
934 }
935
936 /*
900 * First collect all the processes that have the page 937 * First collect all the processes that have the page
901 * mapped in dirty form. This has to be done before try_to_unmap, 938 * mapped in dirty form. This has to be done before try_to_unmap,
902 * because ttu takes the rmap data structures down. 939 * because ttu takes the rmap data structures down.
@@ -905,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
905 * there's nothing that can be done. 942 * there's nothing that can be done.
906 */ 943 */
907 if (kill) 944 if (kill)
908 collect_procs(hpage, &tokill); 945 collect_procs(ppage, &tokill);
946
947 if (hpage != ppage)
948 lock_page_nosync(ppage);
909 949
910 ret = try_to_unmap(hpage, ttu); 950 ret = try_to_unmap(ppage, ttu);
911 if (ret != SWAP_SUCCESS) 951 if (ret != SWAP_SUCCESS)
912 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 952 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
913 pfn, page_mapcount(hpage)); 953 pfn, page_mapcount(ppage));
954
955 if (hpage != ppage)
956 unlock_page(ppage);
914 957
915 /* 958 /*
916 * Now that the dirty bit has been propagated to the 959 * Now that the dirty bit has been propagated to the
@@ -921,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
921 * use a more force-full uncatchable kill to prevent 964 * use a more force-full uncatchable kill to prevent
922 * any accesses to the poisoned memory. 965 * any accesses to the poisoned memory.
923 */ 966 */
924 kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, 967 kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
925 ret != SWAP_SUCCESS, p, pfn); 968 ret != SWAP_SUCCESS, p, pfn);
926 969
927 return ret; 970 return ret;
@@ -1022,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1022 * The check (unnecessarily) ignores LRU pages being isolated and 1065 * The check (unnecessarily) ignores LRU pages being isolated and
1023 * walked by the page reclaim code, however that's not a big loss. 1066 * walked by the page reclaim code, however that's not a big loss.
1024 */ 1067 */
1025 if (!PageLRU(p) && !PageHuge(p)) 1068 if (!PageHuge(p) && !PageTransCompound(p)) {
1026 shake_page(p, 0); 1069 if (!PageLRU(p))
1027 if (!PageLRU(p) && !PageHuge(p)) { 1070 shake_page(p, 0);
1028 /* 1071 if (!PageLRU(p)) {
1029 * shake_page could have turned it free. 1072 /*
1030 */ 1073 * shake_page could have turned it free.
1031 if (is_free_buddy_page(p)) { 1074 */
1032 action_result(pfn, "free buddy, 2nd try", DELAYED); 1075 if (is_free_buddy_page(p)) {
1033 return 0; 1076 action_result(pfn, "free buddy, 2nd try",
1077 DELAYED);
1078 return 0;
1079 }
1080 action_result(pfn, "non LRU", IGNORED);
1081 put_page(p);
1082 return -EBUSY;
1034 } 1083 }
1035 action_result(pfn, "non LRU", IGNORED);
1036 put_page(p);
1037 return -EBUSY;
1038 } 1084 }
1039 1085
1040 /* 1086 /*
@@ -1064,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1064 * For error on the tail page, we should set PG_hwpoison 1110 * For error on the tail page, we should set PG_hwpoison
1065 * on the head page to show that the hugepage is hwpoisoned 1111 * on the head page to show that the hugepage is hwpoisoned
1066 */ 1112 */
1067 if (PageTail(p) && TestSetPageHWPoison(hpage)) { 1113 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1068 action_result(pfn, "hugepage already hardware poisoned", 1114 action_result(pfn, "hugepage already hardware poisoned",
1069 IGNORED); 1115 IGNORED);
1070 unlock_page(hpage); 1116 unlock_page(hpage);
@@ -1295,7 +1341,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
1295 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, 1341 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
1296 true); 1342 true);
1297 if (ret) { 1343 if (ret) {
1298 putback_lru_pages(&pagelist); 1344 struct page *page1, *page2;
1345 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1346 put_page(page1);
1347
1299 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1348 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1300 pfn, ret, page->flags); 1349 pfn, ret, page->flags);
1301 if (ret > 0) 1350 if (ret > 0)
@@ -1419,6 +1468,7 @@ int soft_offline_page(struct page *page, int flags)
1419 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1468 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1420 0, true); 1469 0, true);
1421 if (ret) { 1470 if (ret) {
1471 putback_lru_pages(&pagelist);
1422 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1472 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1423 pfn, ret, page->flags); 1473 pfn, ret, page->flags);
1424 if (ret > 0) 1474 if (ret > 0)
diff --git a/mm/memory.c b/mm/memory.c
index 31250faff390..5823698c2b71 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2219,7 +2219,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2219 &ptl); 2219 &ptl);
2220 if (!pte_same(*page_table, orig_pte)) { 2220 if (!pte_same(*page_table, orig_pte)) {
2221 unlock_page(old_page); 2221 unlock_page(old_page);
2222 page_cache_release(old_page);
2223 goto unlock; 2222 goto unlock;
2224 } 2223 }
2225 page_cache_release(old_page); 2224 page_cache_release(old_page);
@@ -2289,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2289 &ptl); 2288 &ptl);
2290 if (!pte_same(*page_table, orig_pte)) { 2289 if (!pte_same(*page_table, orig_pte)) {
2291 unlock_page(old_page); 2290 unlock_page(old_page);
2292 page_cache_release(old_page);
2293 goto unlock; 2291 goto unlock;
2294 } 2292 }
2295 2293
@@ -2367,16 +2365,6 @@ gotten:
2367 } 2365 }
2368 __SetPageUptodate(new_page); 2366 __SetPageUptodate(new_page);
2369 2367
2370 /*
2371 * Don't let another task, with possibly unlocked vma,
2372 * keep the mlocked page.
2373 */
2374 if ((vma->vm_flags & VM_LOCKED) && old_page) {
2375 lock_page(old_page); /* for LRU manipulation */
2376 clear_page_mlock(old_page);
2377 unlock_page(old_page);
2378 }
2379
2380 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2368 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2381 goto oom_free_new; 2369 goto oom_free_new;
2382 2370
@@ -2444,10 +2432,20 @@ gotten:
2444 2432
2445 if (new_page) 2433 if (new_page)
2446 page_cache_release(new_page); 2434 page_cache_release(new_page);
2447 if (old_page)
2448 page_cache_release(old_page);
2449unlock: 2435unlock:
2450 pte_unmap_unlock(page_table, ptl); 2436 pte_unmap_unlock(page_table, ptl);
2437 if (old_page) {
2438 /*
2439 * Don't let another task, with possibly unlocked vma,
2440 * keep the mlocked page.
2441 */
2442 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
2443 lock_page(old_page); /* LRU manipulation */
2444 munlock_vma_page(old_page);
2445 unlock_page(old_page);
2446 }
2447 page_cache_release(old_page);
2448 }
2451 return ret; 2449 return ret;
2452oom_free_new: 2450oom_free_new:
2453 page_cache_release(new_page); 2451 page_cache_release(new_page);
@@ -2650,6 +2648,7 @@ void unmap_mapping_range(struct address_space *mapping,
2650 details.last_index = ULONG_MAX; 2648 details.last_index = ULONG_MAX;
2651 details.i_mmap_lock = &mapping->i_mmap_lock; 2649 details.i_mmap_lock = &mapping->i_mmap_lock;
2652 2650
2651 mutex_lock(&mapping->unmap_mutex);
2653 spin_lock(&mapping->i_mmap_lock); 2652 spin_lock(&mapping->i_mmap_lock);
2654 2653
2655 /* Protect against endless unmapping loops */ 2654 /* Protect against endless unmapping loops */
@@ -2666,6 +2665,7 @@ void unmap_mapping_range(struct address_space *mapping,
2666 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2665 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2667 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2666 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2668 spin_unlock(&mapping->i_mmap_lock); 2667 spin_unlock(&mapping->i_mmap_lock);
2668 mutex_unlock(&mapping->unmap_mutex);
2669} 2669}
2670EXPORT_SYMBOL(unmap_mapping_range); 2670EXPORT_SYMBOL(unmap_mapping_range);
2671 2671
@@ -3053,12 +3053,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3053 goto out; 3053 goto out;
3054 } 3054 }
3055 charged = 1; 3055 charged = 1;
3056 /*
3057 * Don't let another task, with possibly unlocked vma,
3058 * keep the mlocked page.
3059 */
3060 if (vma->vm_flags & VM_LOCKED)
3061 clear_page_mlock(vmf.page);
3062 copy_user_highpage(page, vmf.page, address, vma); 3056 copy_user_highpage(page, vmf.page, address, vma);
3063 __SetPageUptodate(page); 3057 __SetPageUptodate(page);
3064 } else { 3058 } else {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 368fc9d23610..b53ec99f1428 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1524} 1524}
1525 1525
1526/* Return a zonelist indicated by gfp for node representing a mempolicy */ 1526/* Return a zonelist indicated by gfp for node representing a mempolicy */
1527static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) 1527static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1528 int nd)
1528{ 1529{
1529 int nd = numa_node_id();
1530
1531 switch (policy->mode) { 1530 switch (policy->mode) {
1532 case MPOL_PREFERRED: 1531 case MPOL_PREFERRED:
1533 if (!(policy->flags & MPOL_F_LOCAL)) 1532 if (!(policy->flags & MPOL_F_LOCAL))
@@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1679 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1678 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1680 huge_page_shift(hstate_vma(vma))), gfp_flags); 1679 huge_page_shift(hstate_vma(vma))), gfp_flags);
1681 } else { 1680 } else {
1682 zl = policy_zonelist(gfp_flags, *mpol); 1681 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1683 if ((*mpol)->mode == MPOL_BIND) 1682 if ((*mpol)->mode == MPOL_BIND)
1684 *nodemask = &(*mpol)->v.nodes; 1683 *nodemask = &(*mpol)->v.nodes;
1685 } 1684 }
@@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1820 */ 1819 */
1821struct page * 1820struct page *
1822alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 1821alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1823 unsigned long addr) 1822 unsigned long addr, int node)
1824{ 1823{
1825 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1824 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1826 struct zonelist *zl; 1825 struct zonelist *zl;
@@ -1830,13 +1829,13 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1830 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1829 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1831 unsigned nid; 1830 unsigned nid;
1832 1831
1833 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); 1832 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1834 mpol_cond_put(pol); 1833 mpol_cond_put(pol);
1835 page = alloc_page_interleave(gfp, order, nid); 1834 page = alloc_page_interleave(gfp, order, nid);
1836 put_mems_allowed(); 1835 put_mems_allowed();
1837 return page; 1836 return page;
1838 } 1837 }
1839 zl = policy_zonelist(gfp, pol); 1838 zl = policy_zonelist(gfp, pol, node);
1840 if (unlikely(mpol_needs_cond_ref(pol))) { 1839 if (unlikely(mpol_needs_cond_ref(pol))) {
1841 /* 1840 /*
1842 * slow path: ref counted shared policy 1841 * slow path: ref counted shared policy
@@ -1892,7 +1891,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1892 page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); 1891 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1893 else 1892 else
1894 page = __alloc_pages_nodemask(gfp, order, 1893 page = __alloc_pages_nodemask(gfp, order,
1895 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); 1894 policy_zonelist(gfp, pol, numa_node_id()),
1895 policy_nodemask(gfp, pol));
1896 put_mems_allowed(); 1896 put_mems_allowed();
1897 return page; 1897 return page;
1898} 1898}
diff --git a/mm/migrate.c b/mm/migrate.c
index 46fe8cc13d67..352de555626c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -772,6 +772,7 @@ uncharge:
772unlock: 772unlock:
773 unlock_page(page); 773 unlock_page(page);
774 774
775move_newpage:
775 if (rc != -EAGAIN) { 776 if (rc != -EAGAIN) {
776 /* 777 /*
777 * A page that has been migrated has all references 778 * A page that has been migrated has all references
@@ -785,8 +786,6 @@ unlock:
785 putback_lru_page(page); 786 putback_lru_page(page);
786 } 787 }
787 788
788move_newpage:
789
790 /* 789 /*
791 * Move the new page to the LRU. If migration was not successful 790 * Move the new page to the LRU. If migration was not successful
792 * then this will free the page. 791 * then this will free the page.
@@ -888,7 +887,7 @@ out:
888 * are movable anymore because to has become empty 887 * are movable anymore because to has become empty
889 * or no retryable pages exist anymore. 888 * or no retryable pages exist anymore.
890 * Caller should call putback_lru_pages to return pages to the LRU 889 * Caller should call putback_lru_pages to return pages to the LRU
891 * or free list. 890 * or free list only if ret != 0.
892 * 891 *
893 * Return: Number of pages not migrated or error code. 892 * Return: Number of pages not migrated or error code.
894 */ 893 */
@@ -981,10 +980,6 @@ int migrate_huge_pages(struct list_head *from,
981 } 980 }
982 rc = 0; 981 rc = 0;
983out: 982out:
984
985 list_for_each_entry_safe(page, page2, from, lru)
986 put_page(page);
987
988 if (rc) 983 if (rc)
989 return rc; 984 return rc;
990 985
@@ -1292,14 +1287,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1292 return -EPERM; 1287 return -EPERM;
1293 1288
1294 /* Find the mm_struct */ 1289 /* Find the mm_struct */
1295 read_lock(&tasklist_lock); 1290 rcu_read_lock();
1296 task = pid ? find_task_by_vpid(pid) : current; 1291 task = pid ? find_task_by_vpid(pid) : current;
1297 if (!task) { 1292 if (!task) {
1298 read_unlock(&tasklist_lock); 1293 rcu_read_unlock();
1299 return -ESRCH; 1294 return -ESRCH;
1300 } 1295 }
1301 mm = get_task_mm(task); 1296 mm = get_task_mm(task);
1302 read_unlock(&tasklist_lock); 1297 rcu_read_unlock();
1303 1298
1304 if (!mm) 1299 if (!mm)
1305 return -EINVAL; 1300 return -EINVAL;
diff --git a/mm/mlock.c b/mm/mlock.c
index 13e81ee8be9d..c3924c7f00be 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -178,6 +178,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
178 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) 178 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
179 gup_flags |= FOLL_WRITE; 179 gup_flags |= FOLL_WRITE;
180 180
181 /*
182 * We want mlock to succeed for regions that have any permissions
183 * other than PROT_NONE.
184 */
185 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
186 gup_flags |= FOLL_FORCE;
187
181 if (vma->vm_flags & VM_LOCKED) 188 if (vma->vm_flags & VM_LOCKED)
182 gup_flags |= FOLL_MLOCK; 189 gup_flags |= FOLL_MLOCK;
183 190
diff --git a/mm/mremap.c b/mm/mremap.c
index 9925b6391b80..1de98d492ddc 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -94,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
94 */ 94 */
95 mapping = vma->vm_file->f_mapping; 95 mapping = vma->vm_file->f_mapping;
96 spin_lock(&mapping->i_mmap_lock); 96 spin_lock(&mapping->i_mmap_lock);
97 if (new_vma->vm_truncate_count && 97 new_vma->vm_truncate_count = 0;
98 new_vma->vm_truncate_count != vma->vm_truncate_count)
99 new_vma->vm_truncate_count = 0;
100 } 98 }
101 99
102 /* 100 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 90c1439549fd..cdef1d4b4e47 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1088,8 +1088,10 @@ static void drain_pages(unsigned int cpu)
1088 pset = per_cpu_ptr(zone->pageset, cpu); 1088 pset = per_cpu_ptr(zone->pageset, cpu);
1089 1089
1090 pcp = &pset->pcp; 1090 pcp = &pset->pcp;
1091 free_pcppages_bulk(zone, pcp->count, pcp); 1091 if (pcp->count) {
1092 pcp->count = 0; 1092 free_pcppages_bulk(zone, pcp->count, pcp);
1093 pcp->count = 0;
1094 }
1093 local_irq_restore(flags); 1095 local_irq_restore(flags);
1094 } 1096 }
1095} 1097}
@@ -2034,6 +2036,14 @@ restart:
2034 */ 2036 */
2035 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2037 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2036 2038
2039 /*
2040 * Find the true preferred zone if the allocation is unconstrained by
2041 * cpusets.
2042 */
2043 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2044 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2045 &preferred_zone);
2046
2037 /* This is the last chance, in general, before the goto nopage. */ 2047 /* This is the last chance, in general, before the goto nopage. */
2038 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2048 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2039 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2049 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2192,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2192 2202
2193 get_mems_allowed(); 2203 get_mems_allowed();
2194 /* The preferred zone is used for statistics later */ 2204 /* The preferred zone is used for statistics later */
2195 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); 2205 first_zones_zonelist(zonelist, high_zoneidx,
2206 nodemask ? : &cpuset_current_mems_allowed,
2207 &preferred_zone);
2196 if (!preferred_zone) { 2208 if (!preferred_zone) {
2197 put_mems_allowed(); 2209 put_mems_allowed();
2198 return NULL; 2210 return NULL;
@@ -5364,10 +5376,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5364 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5376 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5365 unsigned long check = pfn + iter; 5377 unsigned long check = pfn + iter;
5366 5378
5367 if (!pfn_valid_within(check)) { 5379 if (!pfn_valid_within(check))
5368 iter++;
5369 continue; 5380 continue;
5370 } 5381
5371 page = pfn_to_page(check); 5382 page = pfn_to_page(check);
5372 if (!page_count(page)) { 5383 if (!page_count(page)) {
5373 if (PageBuddy(page)) 5384 if (PageBuddy(page))
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0369f5b3ba1b..eb663fb533e0 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -6,6 +6,7 @@
6 * Copyright (C) 2010 Linus Torvalds 6 * Copyright (C) 2010 Linus Torvalds
7 */ 7 */
8 8
9#include <linux/pagemap.h>
9#include <asm/tlb.h> 10#include <asm/tlb.h>
10#include <asm-generic/pgtable.h> 11#include <asm-generic/pgtable.h>
11 12
diff --git a/mm/rmap.c b/mm/rmap.c
index f21f4a1d6a1c..941bf82e8961 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -497,41 +497,51 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
497 struct mm_struct *mm = vma->vm_mm; 497 struct mm_struct *mm = vma->vm_mm;
498 int referenced = 0; 498 int referenced = 0;
499 499
500 /*
501 * Don't want to elevate referenced for mlocked page that gets this far,
502 * in order that it progresses to try_to_unmap and is moved to the
503 * unevictable list.
504 */
505 if (vma->vm_flags & VM_LOCKED) {
506 *mapcount = 0; /* break early from loop */
507 *vm_flags |= VM_LOCKED;
508 goto out;
509 }
510
511 /* Pretend the page is referenced if the task has the
512 swap token and is in the middle of a page fault. */
513 if (mm != current->mm && has_swap_token(mm) &&
514 rwsem_is_locked(&mm->mmap_sem))
515 referenced++;
516
517 if (unlikely(PageTransHuge(page))) { 500 if (unlikely(PageTransHuge(page))) {
518 pmd_t *pmd; 501 pmd_t *pmd;
519 502
520 spin_lock(&mm->page_table_lock); 503 spin_lock(&mm->page_table_lock);
504 /*
505 * rmap might return false positives; we must filter
506 * these out using page_check_address_pmd().
507 */
521 pmd = page_check_address_pmd(page, mm, address, 508 pmd = page_check_address_pmd(page, mm, address,
522 PAGE_CHECK_ADDRESS_PMD_FLAG); 509 PAGE_CHECK_ADDRESS_PMD_FLAG);
523 if (pmd && !pmd_trans_splitting(*pmd) && 510 if (!pmd) {
524 pmdp_clear_flush_young_notify(vma, address, pmd)) 511 spin_unlock(&mm->page_table_lock);
512 goto out;
513 }
514
515 if (vma->vm_flags & VM_LOCKED) {
516 spin_unlock(&mm->page_table_lock);
517 *mapcount = 0; /* break early from loop */
518 *vm_flags |= VM_LOCKED;
519 goto out;
520 }
521
522 /* go ahead even if the pmd is pmd_trans_splitting() */
523 if (pmdp_clear_flush_young_notify(vma, address, pmd))
525 referenced++; 524 referenced++;
526 spin_unlock(&mm->page_table_lock); 525 spin_unlock(&mm->page_table_lock);
527 } else { 526 } else {
528 pte_t *pte; 527 pte_t *pte;
529 spinlock_t *ptl; 528 spinlock_t *ptl;
530 529
530 /*
531 * rmap might return false positives; we must filter
532 * these out using page_check_address().
533 */
531 pte = page_check_address(page, mm, address, &ptl, 0); 534 pte = page_check_address(page, mm, address, &ptl, 0);
532 if (!pte) 535 if (!pte)
533 goto out; 536 goto out;
534 537
538 if (vma->vm_flags & VM_LOCKED) {
539 pte_unmap_unlock(pte, ptl);
540 *mapcount = 0; /* break early from loop */
541 *vm_flags |= VM_LOCKED;
542 goto out;
543 }
544
535 if (ptep_clear_flush_young_notify(vma, address, pte)) { 545 if (ptep_clear_flush_young_notify(vma, address, pte)) {
536 /* 546 /*
537 * Don't treat a reference through a sequentially read 547 * Don't treat a reference through a sequentially read
@@ -546,6 +556,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
546 pte_unmap_unlock(pte, ptl); 556 pte_unmap_unlock(pte, ptl);
547 } 557 }
548 558
559 /* Pretend the page is referenced if the task has the
560 swap token and is in the middle of a page fault. */
561 if (mm != current->mm && has_swap_token(mm) &&
562 rwsem_is_locked(&mm->mmap_sem))
563 referenced++;
564
549 (*mapcount)--; 565 (*mapcount)--;
550 566
551 if (referenced) 567 if (referenced)
diff --git a/mm/shmem.c b/mm/shmem.c
index 5ee67c990602..3437b65d6d6e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2144,8 +2144,10 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2144{ 2144{
2145 struct inode *inode = dentry->d_inode; 2145 struct inode *inode = dentry->d_inode;
2146 2146
2147 if (*len < 3) 2147 if (*len < 3) {
2148 *len = 3;
2148 return 255; 2149 return 255;
2150 }
2149 2151
2150 if (inode_unhashed(inode)) { 2152 if (inode_unhashed(inode)) {
2151 /* Unfortunately insert_inode_hash is not idempotent, 2153 /* Unfortunately insert_inode_hash is not idempotent,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 07a458d72fa8..0341c5700e34 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1940,7 +1940,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1940 1940
1941 error = -EINVAL; 1941 error = -EINVAL;
1942 if (S_ISBLK(inode->i_mode)) { 1942 if (S_ISBLK(inode->i_mode)) {
1943 bdev = I_BDEV(inode); 1943 bdev = bdgrab(I_BDEV(inode));
1944 error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, 1944 error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1945 sys_swapon); 1945 sys_swapon);
1946 if (error < 0) { 1946 if (error < 0) {
diff --git a/mm/truncate.c b/mm/truncate.c
index 49feb46e77b8..d64296be00d3 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -225,6 +225,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
225 next = start; 225 next = start;
226 while (next <= end && 226 while (next <= end &&
227 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 227 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
228 mem_cgroup_uncharge_start();
228 for (i = 0; i < pagevec_count(&pvec); i++) { 229 for (i = 0; i < pagevec_count(&pvec); i++) {
229 struct page *page = pvec.pages[i]; 230 struct page *page = pvec.pages[i];
230 pgoff_t page_index = page->index; 231 pgoff_t page_index = page->index;
@@ -247,6 +248,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
247 unlock_page(page); 248 unlock_page(page);
248 } 249 }
249 pagevec_release(&pvec); 250 pagevec_release(&pvec);
251 mem_cgroup_uncharge_end();
250 cond_resched(); 252 cond_resched();
251 } 253 }
252 254
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f5d90dedebba..6771ea70bfe7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1841,16 +1841,28 @@ static inline bool should_continue_reclaim(struct zone *zone,
1841 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) 1841 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
1842 return false; 1842 return false;
1843 1843
1844 /* 1844 /* Consider stopping depending on scan and reclaim activity */
1845 * If we failed to reclaim and have scanned the full list, stop. 1845 if (sc->gfp_mask & __GFP_REPEAT) {
1846 * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far 1846 /*
1847 * faster but obviously would be less likely to succeed 1847 * For __GFP_REPEAT allocations, stop reclaiming if the
1848 * allocation. If this is desirable, use GFP_REPEAT to decide 1848 * full LRU list has been scanned and we are still failing
1849 * if both reclaimed and scanned should be checked or just 1849 * to reclaim pages. This full LRU scan is potentially
1850 * reclaimed 1850 * expensive but a __GFP_REPEAT caller really wants to succeed
1851 */ 1851 */
1852 if (!nr_reclaimed && !nr_scanned) 1852 if (!nr_reclaimed && !nr_scanned)
1853 return false; 1853 return false;
1854 } else {
1855 /*
1856 * For non-__GFP_REPEAT allocations which can presumably
1857 * fail without consequence, stop if we failed to reclaim
1858 * any pages from the last SWAP_CLUSTER_MAX number of
1859 * pages that were scanned. This will return to the
 1860 * caller faster at the risk that reclaim/compaction and
 1861 * the resulting allocation attempt fail.
1862 */
1863 if (!nr_reclaimed)
1864 return false;
1865 }
1854 1866
1855 /* 1867 /*
1856 * If we have not reclaimed enough pages for compaction and the 1868 * If we have not reclaimed enough pages for compaction and the
@@ -1882,12 +1894,12 @@ static void shrink_zone(int priority, struct zone *zone,
1882 unsigned long nr[NR_LRU_LISTS]; 1894 unsigned long nr[NR_LRU_LISTS];
1883 unsigned long nr_to_scan; 1895 unsigned long nr_to_scan;
1884 enum lru_list l; 1896 enum lru_list l;
1885 unsigned long nr_reclaimed; 1897 unsigned long nr_reclaimed, nr_scanned;
1886 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1898 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1887 unsigned long nr_scanned = sc->nr_scanned;
1888 1899
1889restart: 1900restart:
1890 nr_reclaimed = 0; 1901 nr_reclaimed = 0;
1902 nr_scanned = sc->nr_scanned;
1891 get_scan_count(zone, sc, nr, priority); 1903 get_scan_count(zone, sc, nr, priority);
1892 1904
1893 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1905 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2083,7 +2095,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2083 struct zone *preferred_zone; 2095 struct zone *preferred_zone;
2084 2096
2085 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), 2097 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2086 NULL, &preferred_zone); 2098 &cpuset_current_mems_allowed,
2099 &preferred_zone);
2087 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); 2100 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2088 } 2101 }
2089 } 2102 }