Diffstat (limited to 'mm')
-rw-r--r--  mm/huge_memory.c     44
-rw-r--r--  mm/memblock.c         2
-rw-r--r--  mm/memcontrol.c      67
-rw-r--r--  mm/memory-failure.c  94
-rw-r--r--  mm/memory.c          32
-rw-r--r--  mm/migrate.c          7
-rw-r--r--  mm/mlock.c            7
-rw-r--r--  mm/vmscan.c           4
8 files changed, 169 insertions(+), 88 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e187454d82f6..3e29781ee762 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1162,7 +1162,12 @@ static void __split_huge_page_refcount(struct page *page)
                /* after clearing PageTail the gup refcount can be released */
                smp_mb();
 
-               page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+               /*
+                * Retain the hwpoison flag of the poisoned tail page:
+                * fixes memory-failure killing the wrong process on the
+                * guest machine (KVM).
+                */
+               page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
                page_tail->flags |= (page->flags &
                                     ((1L << PG_referenced) |
                                      (1L << PG_swapbacked) |
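
The combined mask above works by C operator precedence: `~` binds tighter than `|`, so the right-hand side is `(~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON)`, and the `&=` clears every prep-checked bit except the poison bit. A standalone sketch of the arithmetic; the bit positions below are made up for illustration, only the fact that the poison bit lies inside the prep-checked mask matters:

    #include <assert.h>
    #include <stdio.h>

    /* Hypothetical bit positions standing in for the real page-flag layout. */
    #define PG_REFERENCED            (1UL << 2)
    #define PG_HWPOISON              (1UL << 5)
    #define PAGE_FLAGS_CHECK_AT_PREP ((1UL << 7) - 1)  /* includes PG_HWPOISON */

    int main(void)
    {
        unsigned long tail_flags = PG_REFERENCED | PG_HWPOISON;

        /* ~ binds tighter than |: keep everything outside the prep-checked
         * bits, plus the poison bit itself. */
        tail_flags &= ~PAGE_FLAGS_CHECK_AT_PREP | PG_HWPOISON;

        assert(tail_flags & PG_HWPOISON);       /* poison survives the split */
        assert(!(tail_flags & PG_REFERENCED));  /* other prep bits are cleared */
        printf("tail flags: %#lx\n", tail_flags);
        return 0;
    }
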
@@ -1806,6 +1811,8 @@ static void collapse_huge_page(struct mm_struct *mm,
        /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
        if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
                goto out;
+       if (is_vma_temporary_stack(vma))
+               goto out;
        VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
 
        pgd = pgd_offset(mm, address);
@@ -1847,7 +1854,6 @@ static void collapse_huge_page(struct mm_struct *mm,
                set_pmd_at(mm, address, pmd, _pmd);
                spin_unlock(&mm->page_table_lock);
                anon_vma_unlock(vma->anon_vma);
-               mem_cgroup_uncharge_page(new_page);
                goto out;
        }
 
@@ -1893,6 +1899,7 @@ out_up_write:
        return;
 
 out:
+       mem_cgroup_uncharge_page(new_page);
 #ifdef CONFIG_NUMA
        put_page(new_page);
 #endif
@@ -2027,32 +2034,27 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
                if ((!(vma->vm_flags & VM_HUGEPAGE) &&
                     !khugepaged_always()) ||
                    (vma->vm_flags & VM_NOHUGEPAGE)) {
+               skip:
                        progress++;
                        continue;
                }
-
                /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
-               if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
-                       khugepaged_scan.address = vma->vm_end;
-                       progress++;
-                       continue;
-               }
+               if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+                       goto skip;
+               if (is_vma_temporary_stack(vma))
+                       goto skip;
+
                VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
 
                hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
                hend = vma->vm_end & HPAGE_PMD_MASK;
-               if (hstart >= hend) {
-                       progress++;
-                       continue;
-               }
+               if (hstart >= hend)
+                       goto skip;
+               if (khugepaged_scan.address > hend)
+                       goto skip;
                if (khugepaged_scan.address < hstart)
                        khugepaged_scan.address = hstart;
-               if (khugepaged_scan.address > hend) {
-                       khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
-                       progress++;
-                       continue;
-               }
-               BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+               VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
 
                while (khugepaged_scan.address < hend) {
                        int ret;
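
Note the rounding idiom in this loop: `~HPAGE_PMD_MASK` equals `HPAGE_PMD_SIZE - 1`, so `hstart` is the VMA start rounded up to a huge-page boundary and `hend` is the end rounded down; a VMA too small to cover one aligned huge page yields `hstart >= hend` and falls out through the shared `skip:` label. A minimal sketch of the arithmetic, assuming 2 MiB huge pages:

    #include <stdio.h>

    #define HPAGE_PMD_SIZE (2UL << 20)             /* assume 2 MiB huge pages */
    #define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))

    int main(void)
    {
        unsigned long vm_start = 0x00301000, vm_end = 0x00a02000;

        /* round up: add ~MASK (== SIZE - 1) before masking */
        unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        /* round down */
        unsigned long hend = vm_end & HPAGE_PMD_MASK;

        printf("scan [%#lx, %#lx)%s\n", hstart, hend,
               hstart >= hend ? " -- nothing to do" : "");
        return 0;
    }
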
@@ -2081,7 +2083,7 @@ breakouterloop:
 breakouterloop_mmap_sem:
 
        spin_lock(&khugepaged_mm_lock);
-       BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+       VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
        /*
         * Release the current mm_slot if this mm is about to die, or
         * if we scanned all vmas of this mm.
@@ -2236,9 +2238,9 @@ static int khugepaged(void *none)
 
        for (;;) {
                mutex_unlock(&khugepaged_mutex);
-               BUG_ON(khugepaged_thread != current);
+               VM_BUG_ON(khugepaged_thread != current);
                khugepaged_loop();
-               BUG_ON(khugepaged_thread != current);
+               VM_BUG_ON(khugepaged_thread != current);
 
                mutex_lock(&khugepaged_mutex);
                if (!khugepaged_enabled())
diff --git a/mm/memblock.c b/mm/memblock.c
index bdba245d8afd..4618fda975a0 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -137,8 +137,6 @@ static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
 
        BUG_ON(0 == size);
 
-       size = memblock_align_up(size, align);
-
        /* Pump up max_addr */
        if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
                end = memblock.current_limit;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3878cfe399dc..da53a252b259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -612,8 +612,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
        /* pagein of a big page is an event. So, ignore page size */
        if (nr_pages > 0)
                __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
-       else
+       else {
                __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
+               nr_pages = -nr_pages; /* for event */
+       }
 
        __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
 
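
`nr_pages` is positive on charge and negative on uncharge, so without the negation the MEM_CGROUP_EVENTS counter would be decremented by pageouts and the periodic event check would fire too rarely. A toy model of the fixed accounting, with the per-CPU counters reduced to plain longs:

    #include <stdio.h>

    static long pgpgin, pgpgout, events;

    /* Sketch: the event counter must advance for both directions. */
    static void charge_statistics(long nr_pages)
    {
        if (nr_pages > 0)
            pgpgin++;
        else {
            pgpgout++;
            nr_pages = -nr_pages; /* for event */
        }
        events += nr_pages;
    }

    int main(void)
    {
        charge_statistics(512);  /* charge 512 pages */
        charge_statistics(-512); /* uncharge them again */
        printf("events = %ld (would be 0 without the negation)\n", events);
        return 0;
    }
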
@@ -1111,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
        return false;
 }
 
+/**
+ * mem_cgroup_check_margin - check if the memory cgroup allows charging
+ * @mem: memory cgroup to check
+ * @bytes: the number of bytes the caller intends to charge
+ *
+ * Returns a boolean value on whether @mem can be charged @bytes or
+ * whether this would exceed the limit.
+ */
+static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
+{
+       if (!res_counter_check_margin(&mem->res, bytes))
+               return false;
+       if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
+               return false;
+       return true;
+}
+
 static unsigned int get_swappiness(struct mem_cgroup *memcg)
 {
        struct cgroup *cgrp = memcg->css.cgroup;
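
The helper succeeds only if both the memory counter and, when swap accounting is enabled, the memory+swap counter have room for the requested charge. A userspace sketch of the same margin test, with a deliberately simplified res_counter stand-in (the real one takes a lock and tracks more fields):

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified stand-in for the kernel's res_counter. */
    struct res_counter { unsigned long usage, limit; };

    static bool res_counter_check_margin(struct res_counter *c, unsigned long bytes)
    {
        return c->limit - c->usage >= bytes;
    }

    static bool check_margin(struct res_counter *res, struct res_counter *memsw,
                             bool do_swap_account, unsigned long bytes)
    {
        if (!res_counter_check_margin(res, bytes))
            return false;
        if (do_swap_account && !res_counter_check_margin(memsw, bytes))
            return false;
        return true;
    }

    int main(void)
    {
        struct res_counter res   = { .usage = 90, .limit = 100 };
        struct res_counter memsw = { .usage = 99, .limit = 100 };

        /* 8 bytes fit under res but not under memsw */
        printf("%d\n", check_margin(&res, &memsw, true, 8));  /* 0 */
        printf("%d\n", check_margin(&res, &memsw, false, 8)); /* 1 */
        return 0;
    }
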
@@ -1837,23 +1856,34 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
                        flags |= MEM_CGROUP_RECLAIM_NOSWAP;
        } else
                mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
-
-       if (csize > PAGE_SIZE) /* change csize and retry */
+       /*
+        * csize can be either a huge page (HPAGE_SIZE), a batch of
+        * regular pages (CHARGE_SIZE), or a single regular page
+        * (PAGE_SIZE).
+        *
+        * Never reclaim on behalf of optional batching; retry with a
+        * single page instead.
+        */
+       if (csize == CHARGE_SIZE)
                return CHARGE_RETRY;
 
        if (!(gfp_mask & __GFP_WAIT))
                return CHARGE_WOULDBLOCK;
 
        ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
                                              gfp_mask, flags);
+       if (mem_cgroup_check_margin(mem_over_limit, csize))
+               return CHARGE_RETRY;
        /*
-        * try_to_free_mem_cgroup_pages() might not give us a full
-        * picture of reclaim. Some pages are reclaimed and might be
-        * moved to swap cache or just unmapped from the cgroup.
-        * Check the limit again to see if the reclaim reduced the
-        * current usage of the cgroup before giving up
+        * Even though the limit is exceeded at this point, reclaim
+        * may have been able to free some pages. Retry the charge
+        * before killing the task.
+        *
+        * Only for regular pages, though: huge pages are rather
+        * unlikely to succeed so close to the limit, and we fall back
+        * to regular pages anyway in case of failure.
         */
-       if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+       if (csize == PAGE_SIZE && ret)
                return CHARGE_RETRY;
 
        /*
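
The retry policy after a failed charge is now: never reclaim for the optional batch charge (drop back to a single page instead); give up immediately when the caller cannot sleep; retry if reclaim opened enough margin; and, for a single regular page only, retry once more whenever reclaim made any progress. A compilable sketch of that decision ladder; the parameter names are placeholders for kernel state, not kernel API:

    #include <stdio.h>

    enum charge_outcome { RETRY, WOULDBLOCK, FALLTHROUGH_TO_OOM };

    /* Sketch of the reworked policy in __mem_cgroup_do_charge(): the flags
     * stand in for csize == CHARGE_SIZE / PAGE_SIZE, __GFP_WAIT,
     * mem_cgroup_check_margin() and the reclaim result. */
    static enum charge_outcome after_limit_hit(int is_batch, int can_wait,
                                               int margin_ok, int is_single_page,
                                               int reclaim_made_progress)
    {
        if (is_batch)           /* never reclaim for optional batching... */
            return RETRY;       /* ...retry with a single page instead */
        if (!can_wait)
            return WOULDBLOCK;
        /* hierarchical reclaim runs here */
        if (margin_ok)          /* reclaim opened enough margin */
            return RETRY;
        if (is_single_page && reclaim_made_progress)
            return RETRY;       /* one more try before the OOM path */
        return FALLTHROUGH_TO_OOM;  /* huge pages fall back to 4k anyway */
    }

    int main(void)
    {
        /* huge page, reclaim made progress, but no margin: no retry */
        printf("%d\n", after_limit_hit(0, 1, 0, 0, 1)); /* 2: OOM path */
        return 0;
    }
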
@@ -2323,13 +2353,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
                                gfp_t gfp_mask, enum charge_type ctype)
 {
        struct mem_cgroup *mem = NULL;
+       int page_size = PAGE_SIZE;
        struct page_cgroup *pc;
+       bool oom = true;
        int ret;
-       int page_size = PAGE_SIZE;
 
        if (PageTransHuge(page)) {
                page_size <<= compound_order(page);
                VM_BUG_ON(!PageTransHuge(page));
+               /*
+                * Never OOM-kill a process for a huge page. The
+                * fault handler will fall back to regular pages.
+                */
+               oom = false;
        }
 
        pc = lookup_page_cgroup(page);
@@ -2338,7 +2374,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
                return 0;
        prefetchw(pc);
 
-       ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
+       ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
        if (ret || !mem)
                return ret;
 
@@ -5024,9 +5060,9 @@ struct cgroup_subsys mem_cgroup_subsys = {
 static int __init enable_swap_account(char *s)
 {
        /* consider enabled if no parameter or 1 is given */
-       if (!s || !strcmp(s, "1"))
+       if (!(*s) || !strcmp(s, "=1"))
                really_do_swap_account = 1;
-       else if (!strcmp(s, "0"))
+       else if (!strcmp(s, "=0"))
                really_do_swap_account = 0;
        return 1;
 }
@@ -5034,7 +5070,8 @@ __setup("swapaccount", enable_swap_account);
 
 static int __init disable_swap_account(char *s)
 {
-       enable_swap_account("0");
+       printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
+       enable_swap_account("=0");
        return 1;
 }
 __setup("noswapaccount", disable_swap_account);
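
The parsing fix reflects how __setup() hands arguments to the handler: for __setup("swapaccount", ...) the handler receives the command-line text after the matched prefix, so booting with "swapaccount=1" passes the string "=1" and a bare "swapaccount" passes an empty string. The old comparisons against "1" and "0" could therefore never match. A quick userspace check of the fixed comparisons:

    #include <stdio.h>
    #include <string.h>

    static int really_do_swap_account;

    /* s is what follows "swapaccount" on the command line: "", "=0" or "=1". */
    static int enable_swap_account(char *s)
    {
        if (!(*s) || !strcmp(s, "=1"))
            really_do_swap_account = 1;
        else if (!strcmp(s, "=0"))
            really_do_swap_account = 0;
        return 1;
    }

    int main(void)
    {
        enable_swap_account("=0");
        printf("swapaccount=0 -> %d\n", really_do_swap_account); /* 0 */
        enable_swap_account("");
        printf("swapaccount   -> %d\n", really_do_swap_account); /* 1 */
        return 0;
    }
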
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 548fbd70f026..0207c2f6f8bd 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -233,8 +233,8 @@ void shake_page(struct page *p, int access)
        }
 
        /*
-        * Only all shrink_slab here (which would also
-        * shrink other caches) if access is not potentially fatal.
+        * Only call shrink_slab here (which would also shrink other caches) if
+        * access is not potentially fatal.
         */
        if (access) {
                int nr;
@@ -386,8 +386,6 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
        struct task_struct *tsk;
        struct anon_vma *av;
 
-       if (!PageHuge(page) && unlikely(split_huge_page(page)))
-               return;
        read_lock(&tasklist_lock);
        av = page_lock_anon_vma(page);
        if (av == NULL) /* Not actually mapped anymore */
@@ -856,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
        int ret;
        int kill = 1;
        struct page *hpage = compound_head(p);
+       struct page *ppage;
 
        if (PageReserved(p) || PageSlab(p))
                return SWAP_SUCCESS;
@@ -897,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
        }
 
        /*
+        * ppage: poisoned page
+        *   if p is a regular page (4k page),
+        *     ppage == the real poisoned page;
+        *   else p is hugetlb or THP, and ppage == the head page.
+        */
+       ppage = hpage;
+
+       if (PageTransHuge(hpage)) {
+               /*
+                * Verify that this isn't a hugetlbfs head page; the check
+                * for PageAnon is just to avoid tripping a split_huge_page
+                * internal debug check, as split_huge_page refuses to deal
+                * with anything that isn't an anon page. PageAnon can't go
+                * away from under us because we hold a refcount on the
+                * hpage; without a refcount on the hpage, split_huge_page
+                * can't be safely called in the first place, and having a
+                * refcount on the tail isn't enough to be safe.
+                */
+               if (!PageHuge(hpage) && PageAnon(hpage)) {
+                       if (unlikely(split_huge_page(hpage))) {
+                               /*
+                                * FIXME: if splitting the THP fails, it would
+                                * be better to stop the following operation
+                                * rather than cause a panic by unmapping. The
+                                * system might survive if the page is freed
+                                * later.
+                                */
+                               printk(KERN_INFO
+                                       "MCE %#lx: failed to split THP\n", pfn);
+
+                               BUG_ON(!PageHWPoison(p));
+                               return SWAP_FAIL;
+                       }
+                       /* THP is split, so ppage is the real poisoned page. */
+                       ppage = p;
+               }
+       }
+
+       /*
         * First collect all the processes that have the page
         * mapped in dirty form. This has to be done before try_to_unmap,
         * because ttu takes the rmap data structures down.
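
The point of ppage: once split_huge_page() succeeds, the poisoned subpage p is an ordinary 4k page again, and the rest of the function (collect_procs, try_to_unmap, the kill) must target it rather than the stale head page. A toy model of the selection, with boolean fields standing in for the real page-flag tests:

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-ins for PageHuge()/PageTransHuge()/PageAnon(). */
    struct page { bool hugetlb, thp, anon; };

    /* Pretend the split always succeeds in this sketch. */
    static int split_huge_page(struct page *hpage) { hpage->thp = false; return 0; }

    static struct page *poisoned_target(struct page *p, struct page *hpage)
    {
        struct page *ppage = hpage;     /* default: operate on the head page */

        if (hpage->thp && !hpage->hugetlb && hpage->anon) {
            if (split_huge_page(hpage))
                return NULL;            /* caller returns SWAP_FAIL */
            ppage = p;                  /* after the split, p is a plain 4k page */
        }
        return ppage;
    }

    int main(void)
    {
        struct page head = { .thp = true, .anon = true }, tail = { 0 };
        printf("target is %s\n",
               poisoned_target(&tail, &head) == &tail ? "the 4k page"
                                                      : "the head page");
        return 0;
    }
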
@@ -905,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
         * there's nothing that can be done.
         */
        if (kill)
-               collect_procs(hpage, &tokill);
+               collect_procs(ppage, &tokill);
+
+       if (hpage != ppage)
+               lock_page_nosync(ppage);
 
-       ret = try_to_unmap(hpage, ttu);
+       ret = try_to_unmap(ppage, ttu);
        if (ret != SWAP_SUCCESS)
                printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
-                               pfn, page_mapcount(hpage));
+                               pfn, page_mapcount(ppage));
+
+       if (hpage != ppage)
+               unlock_page(ppage);
 
        /*
         * Now that the dirty bit has been propagated to the
@@ -1022,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
         * The check (unnecessarily) ignores LRU pages being isolated and
         * walked by the page reclaim code, however that's not a big loss.
         */
-       if (!PageLRU(p) && !PageHuge(p))
-               shake_page(p, 0);
-       if (!PageLRU(p) && !PageHuge(p)) {
-               /*
-                * shake_page could have turned it free.
-                */
-               if (is_free_buddy_page(p)) {
-                       action_result(pfn, "free buddy, 2nd try", DELAYED);
-                       return 0;
+       if (!PageHuge(p) && !PageTransCompound(p)) {
+               if (!PageLRU(p))
+                       shake_page(p, 0);
+               if (!PageLRU(p)) {
+                       /*
+                        * shake_page could have turned it free.
+                        */
+                       if (is_free_buddy_page(p)) {
+                               action_result(pfn, "free buddy, 2nd try",
+                                       DELAYED);
+                               return 0;
+                       }
+                       action_result(pfn, "non LRU", IGNORED);
+                       put_page(p);
+                       return -EBUSY;
                }
-               action_result(pfn, "non LRU", IGNORED);
-               put_page(p);
-               return -EBUSY;
        }
 
        /*
@@ -1064,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
         * For error on the tail page, we should set PG_hwpoison
         * on the head page to show that the hugepage is hwpoisoned
         */
-       if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+       if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
                action_result(pfn, "hugepage already hardware poisoned",
                                IGNORED);
                unlock_page(hpage);
@@ -1295,7 +1341,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
        ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
                                 true);
        if (ret) {
-               putback_lru_pages(&pagelist);
+               struct page *page1, *page2;
+               list_for_each_entry_safe(page1, page2, &pagelist, lru)
+                       put_page(page1);
+
                pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
                        pfn, ret, page->flags);
                if (ret > 0)
@@ -1419,6 +1468,7 @@ int soft_offline_page(struct page *page, int flags)
        ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
                                0, true);
        if (ret) {
+               putback_lru_pages(&pagelist);
                pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
                        pfn, ret, page->flags);
                if (ret > 0)
diff --git a/mm/memory.c b/mm/memory.c
index 31250faff390..8e8c18324863 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2219,7 +2219,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                 &ptl);
        if (!pte_same(*page_table, orig_pte)) {
                unlock_page(old_page);
-               page_cache_release(old_page);
                goto unlock;
        }
        page_cache_release(old_page);
@@ -2289,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                 &ptl);
        if (!pte_same(*page_table, orig_pte)) {
                unlock_page(old_page);
-               page_cache_release(old_page);
                goto unlock;
        }
 
@@ -2367,16 +2365,6 @@ gotten:
        }
        __SetPageUptodate(new_page);
 
-       /*
-        * Don't let another task, with possibly unlocked vma,
-        * keep the mlocked page.
-        */
-       if ((vma->vm_flags & VM_LOCKED) && old_page) {
-               lock_page(old_page);    /* for LRU manipulation */
-               clear_page_mlock(old_page);
-               unlock_page(old_page);
-       }
-
        if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
                goto oom_free_new;
 
@@ -2444,10 +2432,20 @@ gotten:
 
        if (new_page)
                page_cache_release(new_page);
-       if (old_page)
-               page_cache_release(old_page);
 unlock:
        pte_unmap_unlock(page_table, ptl);
+       if (old_page) {
+               /*
+                * Don't let another task, with possibly unlocked vma,
+                * keep the mlocked page.
+                */
+               if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+                       lock_page(old_page);    /* LRU manipulation */
+                       munlock_vma_page(old_page);
+                       unlock_page(old_page);
+               }
+               page_cache_release(old_page);
+       }
        return ret;
 oom_free_new:
        page_cache_release(new_page);
@@ -3053,12 +3051,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        goto out;
                }
                charged = 1;
-               /*
-                * Don't let another task, with possibly unlocked vma,
-                * keep the mlocked page.
-                */
-               if (vma->vm_flags & VM_LOCKED)
-                       clear_page_mlock(vmf.page);
                copy_user_highpage(page, vmf.page, address, vma);
                __SetPageUptodate(page);
        } else {
diff --git a/mm/migrate.c b/mm/migrate.c
index 9f29a3b7aac2..766115253807 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -772,6 +772,7 @@ uncharge:
 unlock:
        unlock_page(page);
 
+move_newpage:
        if (rc != -EAGAIN) {
                /*
                 * A page that has been migrated has all references
@@ -785,8 +786,6 @@ unlock:
                putback_lru_page(page);
        }
 
-move_newpage:
-
        /*
         * Move the new page to the LRU. If migration was not successful
         * then this will free the page.
@@ -981,10 +980,6 @@ int migrate_huge_pages(struct list_head *from,
        }
        rc = 0;
 out:
-
-       list_for_each_entry_safe(page, page2, from, lru)
-               put_page(page);
-
        if (rc)
                return rc;
 
diff --git a/mm/mlock.c b/mm/mlock.c
index 13e81ee8be9d..c3924c7f00be 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -178,6 +178,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
        if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
                gup_flags |= FOLL_WRITE;
 
+       /*
+        * We want mlock to succeed for regions that have any permissions
+        * other than PROT_NONE.
+        */
+       if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+               gup_flags |= FOLL_FORCE;
+
        if (vma->vm_flags & VM_LOCKED)
                gup_flags |= FOLL_MLOCK;
 
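
FOLL_FORCE is added so get_user_pages() can fault in pages of, say, a write-only or exec-only region that mlock is allowed to populate; a genuine PROT_NONE region sets none of VM_READ/VM_WRITE/VM_EXEC and still fails. A sketch of the flag composition, with placeholder values for the flag bits:

    #include <stdio.h>

    /* Stand-in flag values; the relationships, not the numbers, matter. */
    #define VM_READ    0x1
    #define VM_WRITE   0x2
    #define VM_EXEC    0x4
    #define VM_SHARED  0x8
    #define VM_LOCKED  0x10

    #define FOLL_WRITE 0x01
    #define FOLL_FORCE 0x02
    #define FOLL_MLOCK 0x04

    static int gup_flags_for(unsigned long vm_flags)
    {
        int gup_flags = 0;

        if ((vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
            gup_flags |= FOLL_WRITE;
        /* any permission other than PROT_NONE: override the access check */
        if (vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
            gup_flags |= FOLL_FORCE;
        if (vm_flags & VM_LOCKED)
            gup_flags |= FOLL_MLOCK;
        return gup_flags;
    }

    int main(void)
    {
        printf("PROT_WRITE|locked -> %#x\n", gup_flags_for(VM_WRITE | VM_LOCKED));
        printf("PROT_NONE |locked -> %#x\n", gup_flags_for(VM_LOCKED));
        return 0;
    }
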
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 148c6e630df2..17497d0cd8b9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1882,12 +1882,12 @@ static void shrink_zone(int priority, struct zone *zone,
        unsigned long nr[NR_LRU_LISTS];
        unsigned long nr_to_scan;
        enum lru_list l;
-       unsigned long nr_reclaimed;
+       unsigned long nr_reclaimed, nr_scanned;
        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
-       unsigned long nr_scanned = sc->nr_scanned;
 
 restart:
        nr_reclaimed = 0;
+       nr_scanned = sc->nr_scanned;
        get_scan_count(zone, sc, nr, priority);
 
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
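
Taking the nr_scanned snapshot under the restart label matters because shrink_zone() can loop back to it: a baseline captured only once at function entry would make a later pass appear to have scanned work done by the earlier one. A toy model of the snapshot-per-pass pattern:

    #include <stdio.h>

    /* scan_pass() stands in for one shrink pass over the zone. */
    struct scan_control { unsigned long nr_scanned; };

    static unsigned long scan_pass(struct scan_control *sc)
    {
        sc->nr_scanned += 100;  /* pretend we scanned 100 pages */
        return 10;              /* and reclaimed 10 */
    }

    int main(void)
    {
        struct scan_control sc = { 0 };
        unsigned long nr_reclaimed, nr_scanned;
        int passes = 2;

    restart:
        nr_reclaimed = 0;
        nr_scanned = sc.nr_scanned;     /* fresh baseline for this pass */
        nr_reclaimed += scan_pass(&sc);
        printf("pass scanned %lu, reclaimed %lu\n",
               sc.nr_scanned - nr_scanned, nr_reclaimed);
        if (--passes)
            goto restart;       /* second pass still reports 100, not 200 */
        return 0;
    }
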