Diffstat (limited to 'mm')
-rw-r--r--  mm/huge_memory.c     |  57
-rw-r--r--  mm/hugetlb.c         |  70
-rw-r--r--  mm/ksm.c             |   1
-rw-r--r--  mm/memory-failure.c  |  13
-rw-r--r--  mm/memory.c          |   3
-rw-r--r--  mm/mempolicy.c       |  48
-rw-r--r--  mm/migrate.c         |   7
-rw-r--r--  mm/msync.c           |   3
-rw-r--r--  mm/nommu.c           |   2
-rw-r--r--  mm/page_alloc.c      |  56
-rw-r--r--  mm/rmap.c            |  22
-rw-r--r--  mm/shmem.c           | 122
-rw-r--r--  mm/slab.c            |  90
-rw-r--r--  mm/slab_common.c     |   2
-rw-r--r--  mm/slub.c            |   6
-rw-r--r--  mm/truncate.c        |  11
16 files changed, 366 insertions(+), 147 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e60837dc785c..33514d88fef9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -941,6 +941,37 @@ unlock:
 	spin_unlock(ptl);
 }
 
+/*
+ * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
+ * during copy_user_huge_page()'s copy_page_rep(): in the case when
+ * the source page gets split and a tail freed before copy completes.
+ * Called under pmd_lock of checked pmd, so safe from splitting itself.
+ */
+static void get_user_huge_page(struct page *page)
+{
+	if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
+		struct page *endpage = page + HPAGE_PMD_NR;
+
+		atomic_add(HPAGE_PMD_NR, &page->_count);
+		while (++page < endpage)
+			get_huge_page_tail(page);
+	} else {
+		get_page(page);
+	}
+}
+
+static void put_user_huge_page(struct page *page)
+{
+	if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
+		struct page *endpage = page + HPAGE_PMD_NR;
+
+		while (page < endpage)
+			put_page(page++);
+	} else {
+		put_page(page);
+	}
+}
+
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long address,
@@ -1074,7 +1105,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		ret |= VM_FAULT_WRITE;
 		goto out_unlock;
 	}
-	get_page(page);
+	get_user_huge_page(page);
 	spin_unlock(ptl);
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
@@ -1095,7 +1126,7 @@ alloc:
 				split_huge_page(page);
 				ret |= VM_FAULT_FALLBACK;
 			}
-			put_page(page);
+			put_user_huge_page(page);
 		}
 		count_vm_event(THP_FAULT_FALLBACK);
 		goto out;
@@ -1105,7 +1136,7 @@ alloc:
 		put_page(new_page);
 		if (page) {
 			split_huge_page(page);
-			put_page(page);
+			put_user_huge_page(page);
 		} else
 			split_huge_page_pmd(vma, address, pmd);
 		ret |= VM_FAULT_FALLBACK;
@@ -1127,7 +1158,7 @@ alloc:
 
 	spin_lock(ptl);
 	if (page)
-		put_page(page);
+		put_user_huge_page(page);
 	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
 		spin_unlock(ptl);
 		mem_cgroup_uncharge_page(new_page);
@@ -2392,8 +2423,6 @@ static void collapse_huge_page(struct mm_struct *mm,
 	pmd = mm_find_pmd(mm, address);
 	if (!pmd)
 		goto out;
-	if (pmd_trans_huge(*pmd))
-		goto out;
 
 	anon_vma_lock_write(vma->anon_vma);
 
@@ -2492,8 +2521,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	pmd = mm_find_pmd(mm, address);
 	if (!pmd)
 		goto out;
-	if (pmd_trans_huge(*pmd))
-		goto out;
 
 	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2846,12 +2873,22 @@ void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
 static void split_huge_page_address(struct mm_struct *mm,
 				    unsigned long address)
 {
+	pgd_t *pgd;
+	pud_t *pud;
 	pmd_t *pmd;
 
 	VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
 
-	pmd = mm_find_pmd(mm, address);
-	if (!pmd)
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		return;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		return;
+
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
 		return;
 	/*
 	 * Caller holds the mmap_sem write mode, so a huge pmd cannot
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 226910cb7c9b..9221c02ed9e2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2520,6 +2520,31 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
 	update_mmu_cache(vma, address, ptep);
 }
 
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+	swp_entry_t swp;
+
+	if (huge_pte_none(pte) || pte_present(pte))
+		return 0;
+	swp = pte_to_swp_entry(pte);
+	if (non_swap_entry(swp) && is_migration_entry(swp))
+		return 1;
+	else
+		return 0;
+}
+
+static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+{
+	swp_entry_t swp;
+
+	if (huge_pte_none(pte) || pte_present(pte))
+		return 0;
+	swp = pte_to_swp_entry(pte);
+	if (non_swap_entry(swp) && is_hwpoison_entry(swp))
+		return 1;
+	else
+		return 0;
+}
 
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			    struct vm_area_struct *vma)
@@ -2559,7 +2584,24 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		dst_ptl = huge_pte_lock(h, dst, dst_pte);
 		src_ptl = huge_pte_lockptr(h, src, src_pte);
 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
-		if (!huge_pte_none(huge_ptep_get(src_pte))) {
+		entry = huge_ptep_get(src_pte);
+		if (huge_pte_none(entry)) { /* skip none entry */
+			;
+		} else if (unlikely(is_hugetlb_entry_migration(entry) ||
+				    is_hugetlb_entry_hwpoisoned(entry))) {
+			swp_entry_t swp_entry = pte_to_swp_entry(entry);
+
+			if (is_write_migration_entry(swp_entry) && cow) {
+				/*
+				 * COW mappings require pages in both
+				 * parent and child to be set to read.
+				 */
+				make_migration_entry_read(&swp_entry);
+				entry = swp_entry_to_pte(swp_entry);
+				set_huge_pte_at(src, addr, src_pte, entry);
+			}
+			set_huge_pte_at(dst, addr, dst_pte, entry);
+		} else {
 			if (cow)
 				huge_ptep_set_wrprotect(src, addr, src_pte);
 			entry = huge_ptep_get(src_pte);
@@ -2578,32 +2620,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	return ret;
 }
 
-static int is_hugetlb_entry_migration(pte_t pte)
-{
-	swp_entry_t swp;
-
-	if (huge_pte_none(pte) || pte_present(pte))
-		return 0;
-	swp = pte_to_swp_entry(pte);
-	if (non_swap_entry(swp) && is_migration_entry(swp))
-		return 1;
-	else
-		return 0;
-}
-
-static int is_hugetlb_entry_hwpoisoned(pte_t pte)
-{
-	swp_entry_t swp;
-
-	if (huge_pte_none(pte) || pte_present(pte))
-		return 0;
-	swp = pte_to_swp_entry(pte);
-	if (non_swap_entry(swp) && is_hwpoison_entry(swp))
-		return 1;
-	else
-		return 0;
-}
-
 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			    unsigned long start, unsigned long end,
 			    struct page *ref_page)
diff --git a/mm/ksm.c b/mm/ksm.c
index 68710e80994a..346ddc9e4c0d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -945,7 +945,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	pmd = mm_find_pmd(mm, addr);
 	if (!pmd)
 		goto out;
-	BUG_ON(pmd_trans_huge(*pmd));
 
 	mmun_start = addr;
 	mmun_end = addr + PAGE_SIZE;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index cd8989c1027e..7211a73ba14d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -435,7 +435,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 	if (av == NULL)	/* Not actually mapped anymore */
 		return;
 
-	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	pgoff = page_to_pgoff(page);
 	read_lock(&tasklist_lock);
 	for_each_process (tsk) {
 		struct anon_vma_chain *vmac;
@@ -469,7 +469,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 	mutex_lock(&mapping->i_mmap_mutex);
 	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
-		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+		pgoff_t pgoff = page_to_pgoff(page);
 		struct task_struct *t = task_early_kill(tsk, force_early);
 
 		if (!t)
@@ -895,7 +895,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	struct page *hpage = *hpagep;
 	struct page *ppage;
 
-	if (PageReserved(p) || PageSlab(p))
+	if (PageReserved(p) || PageSlab(p) || !PageLRU(p))
 		return SWAP_SUCCESS;
 
 	/*
@@ -1159,9 +1159,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 					action_result(pfn, "free buddy, 2nd try", DELAYED);
 				return 0;
 			}
-			action_result(pfn, "non LRU", IGNORED);
-			put_page(p);
-			return -EBUSY;
 		}
 	}
 
@@ -1194,6 +1191,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 		return 0;
 	}
 
+	if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
+		goto identify_page_state;
+
 	/*
 	 * For error on the tail page, we should set PG_hwpoison
 	 * on the head page to show that the hugepage is hwpoisoned
@@ -1243,6 +1243,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 		goto out;
 	}
 
+identify_page_state:
 	res = -EBUSY;
 	/*
 	 * The first check uses the current page flags which may not have any
diff --git a/mm/memory.c b/mm/memory.c
index d67fd9fcf1f2..7e8d8205b610 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2882,7 +2882,8 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * if page by the offset is not ready to be mapped (cold cache or
 	 * something).
 	 */
-	if (vma->vm_ops->map_pages && fault_around_pages() > 1) {
+	if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) &&
+	    fault_around_pages() > 1) {
 		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 		do_fault_around(vma, address, pte, pgoff, flags);
 		if (!pte_same(*pte, orig_pte))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 284974230459..8f5330d74f47 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -656,19 +656,18 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
  * @nodes and @flags,) it's isolated and queued to the pagelist which is
  * passed via @private.)
  */
-static struct vm_area_struct *
+static int
 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		const nodemask_t *nodes, unsigned long flags, void *private)
 {
-	int err;
-	struct vm_area_struct *first, *vma, *prev;
-
+	int err = 0;
+	struct vm_area_struct *vma, *prev;
 
-	first = find_vma(mm, start);
-	if (!first)
-		return ERR_PTR(-EFAULT);
+	vma = find_vma(mm, start);
+	if (!vma)
+		return -EFAULT;
 	prev = NULL;
-	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+	for (; vma && vma->vm_start < end; vma = vma->vm_next) {
 		unsigned long endvma = vma->vm_end;
 
 		if (endvma > end)
@@ -678,9 +677,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 
 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 			if (!vma->vm_next && vma->vm_end < end)
-				return ERR_PTR(-EFAULT);
+				return -EFAULT;
 			if (prev && prev->vm_end < vma->vm_start)
-				return ERR_PTR(-EFAULT);
+				return -EFAULT;
 		}
 
 		if (flags & MPOL_MF_LAZY) {
@@ -694,15 +693,13 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 
 			err = queue_pages_pgd_range(vma, start, endvma, nodes,
 						flags, private);
-			if (err) {
-				first = ERR_PTR(err);
+			if (err)
 				break;
-			}
 		}
 next:
 		prev = vma;
 	}
-	return first;
+	return err;
 }
 
 /*
@@ -1156,16 +1153,17 @@ out:
 
 /*
  * Allocate a new page for page migration based on vma policy.
- * Start assuming that page is mapped by vma pointed to by @private.
+ * Start by assuming the page is mapped by the same vma as contains @start.
  * Search forward from there, if not. N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
 {
-	struct vm_area_struct *vma = (struct vm_area_struct *)private;
+	struct vm_area_struct *vma;
 	unsigned long uninitialized_var(address);
 
+	vma = find_vma(current->mm, start);
 	while (vma) {
 		address = page_address_in_vma(page, vma);
 		if (address != -EFAULT)
@@ -1195,7 +1193,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 	return -ENOSYS;
 }
 
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
 {
 	return NULL;
 }
@@ -1205,7 +1203,6 @@ static long do_mbind(unsigned long start, unsigned long len,
 			unsigned short mode, unsigned short mode_flags,
 			nodemask_t *nmask, unsigned long flags)
 {
-	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
 	struct mempolicy *new;
 	unsigned long end;
@@ -1271,11 +1268,9 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (err)
 		goto mpol_out;
 
-	vma = queue_pages_range(mm, start, end, nmask,
+	err = queue_pages_range(mm, start, end, nmask,
 			  flags | MPOL_MF_INVERT, &pagelist);
-
-	err = PTR_ERR(vma);	/* maybe ... */
-	if (!IS_ERR(vma))
+	if (!err)
 		err = mbind_range(mm, start, end, new);
 
 	if (!err) {
@@ -1283,9 +1278,8 @@ static long do_mbind(unsigned long start, unsigned long len,
 
 		if (!list_empty(&pagelist)) {
 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
-			nr_failed = migrate_pages(&pagelist, new_vma_page,
-					NULL, (unsigned long)vma,
-					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+			nr_failed = migrate_pages(&pagelist, new_page, NULL,
+				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
 			if (nr_failed)
 				putback_movable_pages(&pagelist);
 		}
@@ -2145,7 +2139,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 	} else
 		*new = *old;
 
-	rcu_read_lock();
 	if (current_cpuset_is_being_rebound()) {
 		nodemask_t mems = cpuset_mems_allowed(current);
 		if (new->flags & MPOL_F_REBINDING)
@@ -2153,7 +2146,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 		else
 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
 	}
-	rcu_read_unlock();
 	atomic_set(&new->refcnt, 1);
 	return new;
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 63f0cd559999..be6dbf995c0c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -120,8 +120,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 		pmd = mm_find_pmd(mm, addr);
 		if (!pmd)
 			goto out;
-		if (pmd_trans_huge(*pmd))
-			goto out;
 
 		ptep = pte_offset_map(pmd, addr);
 
@@ -990,9 +988,10 @@ out:
 	 * it. Otherwise, putback_lru_page() will drop the reference grabbed
 	 * during isolation.
 	 */
-	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
+		ClearPageSwapBacked(newpage);
 		put_new_page(newpage, private);
-	else
+	} else
 		putback_lru_page(newpage);
 
 	if (result) {
diff --git a/mm/msync.c b/mm/msync.c
index a5c673669ca6..992a1673d488 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -78,7 +78,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
 			goto out_unlock;
 		}
 		file = vma->vm_file;
-		fstart = start + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+		fstart = (start - vma->vm_start) +
+			 ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 		fend = fstart + (min(end, vma->vm_end) - start) - 1;
 		start = vma->vm_end;
 		if ((flags & MS_SYNC) && file &&
diff --git a/mm/nommu.c b/mm/nommu.c
index b78e3a8f5ee7..4a852f6c5709 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -786,7 +786,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
 	for (i = 0; i < VMACACHE_SIZE; i++) {
 		/* if the vma is cached, invalidate the entire cache */
 		if (curr->vmacache[i] == vma) {
-			vmacache_invalidate(curr->mm);
+			vmacache_invalidate(mm);
 			break;
 		}
 	}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4f59fa29eda8..0ea758b898fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
 
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_FRACTION	(8)
 
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
@@ -815,9 +816,21 @@ void __init init_cma_reserved_pageblock(struct page *page)
 		set_page_count(p, 0);
 	} while (++p, --i);
 
-	set_page_refcounted(page);
 	set_pageblock_migratetype(page, MIGRATE_CMA);
-	__free_pages(page, pageblock_order);
+
+	if (pageblock_order >= MAX_ORDER) {
+		i = pageblock_nr_pages;
+		p = page;
+		do {
+			set_page_refcounted(p);
+			__free_pages(p, MAX_ORDER - 1);
+			p += MAX_ORDER_NR_PAGES;
+		} while (i -= MAX_ORDER_NR_PAGES);
+	} else {
+		set_page_refcounted(page);
+		__free_pages(page, pageblock_order);
+	}
+
 	adjust_managed_page_count(page, pageblock_nr_pages);
 }
 #endif
@@ -4145,7 +4158,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
 #endif
 
-static int __meminit zone_batchsize(struct zone *zone)
+static int zone_batchsize(struct zone *zone)
 {
 #ifdef CONFIG_MMU
 	int batch;
@@ -4261,8 +4274,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,
 	pageset_update(&p->pcp, high, batch);
 }
 
-static void __meminit pageset_set_high_and_batch(struct zone *zone,
-		struct per_cpu_pageset *pcp)
+static void pageset_set_high_and_batch(struct zone *zone,
+				       struct per_cpu_pageset *pcp)
 {
 	if (percpu_pagelist_fraction)
 		pageset_set_high(pcp,
@@ -5881,23 +5894,38 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	struct zone *zone;
-	unsigned int cpu;
+	int old_percpu_pagelist_fraction;
 	int ret;
 
+	mutex_lock(&pcp_batch_high_lock);
+	old_percpu_pagelist_fraction = percpu_pagelist_fraction;
+
 	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
-	if (!write || (ret < 0))
-		return ret;
+	if (!write || ret < 0)
+		goto out;
+
+	/* Sanity checking to avoid pcp imbalance */
+	if (percpu_pagelist_fraction &&
+	    percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
+		percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* No change? */
+	if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+		goto out;
 
-	mutex_lock(&pcp_batch_high_lock);
 	for_each_populated_zone(zone) {
-		unsigned long high;
-		high = zone->managed_pages / percpu_pagelist_fraction;
+		unsigned int cpu;
+
 		for_each_possible_cpu(cpu)
-			pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
-					 high);
+			pageset_set_high_and_batch(zone,
+					per_cpu_ptr(zone->pageset, cpu));
 	}
+out:
 	mutex_unlock(&pcp_batch_high_lock);
-	return 0;
+	return ret;
 }
 
 int hashdist = HASHDIST_DEFAULT;
diff --git a/mm/rmap.c b/mm/rmap.c
index bf05fc872ae8..22a4a7699cdb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -517,11 +517,7 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
 static inline unsigned long
 __vma_address(struct page *page, struct vm_area_struct *vma)
 {
-	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-
-	if (unlikely(is_vm_hugetlb_page(vma)))
-		pgoff = page->index << huge_page_order(page_hstate(page));
-
+	pgoff_t pgoff = page_to_pgoff(page);
 	return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 }
 
@@ -569,6 +565,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd = NULL;
+	pmd_t pmde;
 
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
@@ -579,7 +576,13 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
 		goto out;
 
 	pmd = pmd_offset(pud, address);
-	if (!pmd_present(*pmd))
+	/*
+	 * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
+	 * without holding anon_vma lock for write. So when looking for a
+	 * genuine pmde (in which to find pte), test present and !THP together.
+	 */
+	pmde = ACCESS_ONCE(*pmd);
+	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
 		pmd = NULL;
 out:
 	return pmd;
@@ -615,9 +618,6 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
 	if (!pmd)
 		return NULL;
 
-	if (pmd_trans_huge(*pmd))
-		return NULL;
-
 	pte = pte_offset_map(pmd, address);
 	/* Make a quick check before getting the lock */
 	if (!sync && !pte_present(*pte)) {
@@ -1635,7 +1635,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
 static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct anon_vma *anon_vma;
-	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	pgoff_t pgoff = page_to_pgoff(page);
 	struct anon_vma_chain *avc;
 	int ret = SWAP_AGAIN;
 
@@ -1676,7 +1676,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
 static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct address_space *mapping = page->mapping;
-	pgoff_t pgoff = page->index << compound_order(page);
+	pgoff_t pgoff = page_to_pgoff(page);
 	struct vm_area_struct *vma;
 	int ret = SWAP_AGAIN;
 
diff --git a/mm/shmem.c b/mm/shmem.c
index f484c276e994..af68b15a8fc1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -80,11 +80,12 @@ static struct vfsmount *shm_mnt;
 #define SHORT_SYMLINK_LEN 128
 
 /*
- * shmem_fallocate and shmem_writepage communicate via inode->i_private
- * (with i_mutex making sure that it has only one user at a time):
- * we would prefer not to enlarge the shmem inode just for that.
+ * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * inode->i_private (with i_mutex making sure that it has only one user at
+ * a time): we would prefer not to enlarge the shmem inode just for that.
 */
 struct shmem_falloc {
+	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
 	pgoff_t start;		/* start of range currently being fallocated */
 	pgoff_t next;		/* the next page offset to be fallocated */
 	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
@@ -467,23 +468,20 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 		return;
 
 	index = start;
-	for ( ; ; ) {
+	while (index < end) {
 		cond_resched();
 
 		pvec.nr = find_get_entries(mapping, index,
 				min(end - index, (pgoff_t)PAGEVEC_SIZE),
 				pvec.pages, indices);
 		if (!pvec.nr) {
-			if (index == start || unfalloc)
+			/* If all gone or hole-punch or unfalloc, we're done */
+			if (index == start || end != -1)
 				break;
+			/* But if truncating, restart to make sure all gone */
 			index = start;
 			continue;
 		}
-		if ((index == start || unfalloc) && indices[0] >= end) {
-			pagevec_remove_exceptionals(&pvec);
-			pagevec_release(&pvec);
-			break;
-		}
 		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
@@ -495,8 +493,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			if (radix_tree_exceptional_entry(page)) {
 				if (unfalloc)
 					continue;
-				nr_swaps_freed += !shmem_free_swap(mapping,
-								index, page);
+				if (shmem_free_swap(mapping, index, page)) {
+					/* Swap was replaced by page: retry */
+					index--;
+					break;
+				}
+				nr_swaps_freed++;
 				continue;
 			}
 
@@ -505,6 +507,11 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			if (page->mapping == mapping) {
 				VM_BUG_ON_PAGE(PageWriteback(page), page);
 				truncate_inode_page(mapping, page);
+			} else {
+				/* Page was replaced by swap: retry */
+				unlock_page(page);
+				index--;
+				break;
 			}
 			}
 			unlock_page(page);
@@ -759,6 +766,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		spin_lock(&inode->i_lock);
 		shmem_falloc = inode->i_private;
 		if (shmem_falloc &&
+		    !shmem_falloc->waitq &&
 		    index >= shmem_falloc->start &&
 		    index < shmem_falloc->next)
 			shmem_falloc->nr_unswapped++;
@@ -1027,6 +1035,9 @@ repeat:
 		goto failed;
 	}
 
+	if (page && sgp == SGP_WRITE)
+		mark_page_accessed(page);
+
 	/* fallocated page? */
 	if (page && !PageUptodate(page)) {
 		if (sgp != SGP_READ)
@@ -1108,6 +1119,9 @@ repeat:
 		shmem_recalc_inode(inode);
 		spin_unlock(&info->lock);
 
+		if (sgp == SGP_WRITE)
+			mark_page_accessed(page);
+
 		delete_from_swap_cache(page);
 		set_page_dirty(page);
 		swap_free(swap);
@@ -1134,6 +1148,9 @@ repeat:
 
 		__SetPageSwapBacked(page);
 		__set_page_locked(page);
+		if (sgp == SGP_WRITE)
+			init_page_accessed(page);
+
 		error = mem_cgroup_charge_file(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
 		if (error)
@@ -1233,6 +1250,64 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	int error;
 	int ret = VM_FAULT_LOCKED;
 
+	/*
+	 * Trinity finds that probing a hole which tmpfs is punching can
+	 * prevent the hole-punch from ever completing: which in turn
+	 * locks writers out with its hold on i_mutex. So refrain from
+	 * faulting pages into the hole while it's being punched. Although
+	 * shmem_undo_range() does remove the additions, it may be unable to
+	 * keep up, as each new page needs its own unmap_mapping_range() call,
+	 * and the i_mmap tree grows ever slower to scan if new vmas are added.
+	 *
+	 * It does not matter if we sometimes reach this check just before the
+	 * hole-punch begins, so that one fault then races with the punch:
+	 * we just need to make racing faults a rare case.
+	 *
+	 * The implementation below would be much simpler if we just used a
+	 * standard mutex or completion: but we cannot take i_mutex in fault,
+	 * and bloating every shmem inode for this unlikely case would be sad.
+	 */
+	if (unlikely(inode->i_private)) {
+		struct shmem_falloc *shmem_falloc;
+
+		spin_lock(&inode->i_lock);
+		shmem_falloc = inode->i_private;
+		if (shmem_falloc &&
+		    shmem_falloc->waitq &&
+		    vmf->pgoff >= shmem_falloc->start &&
+		    vmf->pgoff < shmem_falloc->next) {
+			wait_queue_head_t *shmem_falloc_waitq;
+			DEFINE_WAIT(shmem_fault_wait);
+
+			ret = VM_FAULT_NOPAGE;
+			if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
+			   !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
+				/* It's polite to up mmap_sem if we can */
+				up_read(&vma->vm_mm->mmap_sem);
+				ret = VM_FAULT_RETRY;
+			}
+
+			shmem_falloc_waitq = shmem_falloc->waitq;
+			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
+					TASK_UNINTERRUPTIBLE);
+			spin_unlock(&inode->i_lock);
+			schedule();
+
+			/*
+			 * shmem_falloc_waitq points into the shmem_fallocate()
+			 * stack of the hole-punching task: shmem_falloc_waitq
+			 * is usually invalid by the time we reach here, but
+			 * finish_wait() does not dereference it in that case;
+			 * though i_lock needed lest racing with wake_up_all().
+			 */
+			spin_lock(&inode->i_lock);
+			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
+			spin_unlock(&inode->i_lock);
+			return ret;
+		}
+		spin_unlock(&inode->i_lock);
+	}
+
 	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
 	if (error)
 		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -1372,13 +1447,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	int ret;
 	struct inode *inode = mapping->host;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
-	if (ret == 0 && *pagep)
-		init_page_accessed(*pagep);
-	return ret;
+	return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
 }
 
 static int
@@ -1724,18 +1795,34 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 	pgoff_t start, index, end;
 	int error;
 
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
 	mutex_lock(&inode->i_mutex);
 
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		struct address_space *mapping = file->f_mapping;
 		loff_t unmap_start = round_up(offset, PAGE_SIZE);
 		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
+
+		shmem_falloc.waitq = &shmem_falloc_waitq;
+		shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
+		spin_lock(&inode->i_lock);
+		inode->i_private = &shmem_falloc;
+		spin_unlock(&inode->i_lock);
 
 		if ((u64)unmap_end > (u64)unmap_start)
 			unmap_mapping_range(mapping, unmap_start,
 					    1 + unmap_end - unmap_start, 0);
 		shmem_truncate_range(inode, offset, offset + len - 1);
 		/* No need to unmap again: hole-punching leaves COWed pages */
+
+		spin_lock(&inode->i_lock);
+		inode->i_private = NULL;
+		wake_up_all(&shmem_falloc_waitq);
+		spin_unlock(&inode->i_lock);
 		error = 0;
 		goto out;
 	}
@@ -1753,6 +1840,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		goto out;
 	}
 
+	shmem_falloc.waitq = NULL;
 	shmem_falloc.start = start;
 	shmem_falloc.next = start;
 	shmem_falloc.nr_falloced = 0;
diff --git a/mm/slab.c b/mm/slab.c
index 9ca3b87edabc..3070b929a1bf 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -386,6 +386,39 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 
 #endif
 
+#define OBJECT_FREE (0)
+#define OBJECT_ACTIVE (1)
+
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+
+static void set_obj_status(struct page *page, int idx, int val)
+{
+	int freelist_size;
+	char *status;
+	struct kmem_cache *cachep = page->slab_cache;
+
+	freelist_size = cachep->num * sizeof(freelist_idx_t);
+	status = (char *)page->freelist + freelist_size;
+	status[idx] = val;
+}
+
+static inline unsigned int get_obj_status(struct page *page, int idx)
+{
+	int freelist_size;
+	char *status;
+	struct kmem_cache *cachep = page->slab_cache;
+
+	freelist_size = cachep->num * sizeof(freelist_idx_t);
+	status = (char *)page->freelist + freelist_size;
+
+	return status[idx];
+}
+
+#else
+static inline void set_obj_status(struct page *page, int idx, int val) {}
+
+#endif
+
 /*
  * Do not go above this order unless 0 objects fit into the slab or
  * overridden on the command line.
@@ -576,12 +609,30 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
 	return cachep->array[smp_processor_id()];
 }
 
+static size_t calculate_freelist_size(int nr_objs, size_t align)
+{
+	size_t freelist_size;
+
+	freelist_size = nr_objs * sizeof(freelist_idx_t);
+	if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+		freelist_size += nr_objs * sizeof(char);
+
+	if (align)
+		freelist_size = ALIGN(freelist_size, align);
+
+	return freelist_size;
+}
+
 static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
 				size_t idx_size, size_t align)
 {
 	int nr_objs;
+	size_t remained_size;
 	size_t freelist_size;
+	int extra_space = 0;
 
+	if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+		extra_space = sizeof(char);
 	/*
 	 * Ignore padding for the initial guess. The padding
 	 * is at most @align-1 bytes, and @buffer_size is at
@@ -590,14 +641,15 @@ static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
 	 * into the memory allocation when taking the padding
 	 * into account.
 	 */
-	nr_objs = slab_size / (buffer_size + idx_size);
+	nr_objs = slab_size / (buffer_size + idx_size + extra_space);
 
 	/*
 	 * This calculated number will be either the right
 	 * amount, or one greater than what we want.
 	 */
-	freelist_size = slab_size - nr_objs * buffer_size;
-	if (freelist_size < ALIGN(nr_objs * idx_size, align))
+	remained_size = slab_size - nr_objs * buffer_size;
+	freelist_size = calculate_freelist_size(nr_objs, align);
+	if (remained_size < freelist_size)
 		nr_objs--;
 
 	return nr_objs;
@@ -635,7 +687,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
 	} else {
 		nr_objs = calculate_nr_objs(slab_size, buffer_size,
 					sizeof(freelist_idx_t), align);
-		mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align);
+		mgmt_size = calculate_freelist_size(nr_objs, align);
 	}
 	*num = nr_objs;
 	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
@@ -2041,13 +2093,16 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
 			break;
 
 		if (flags & CFLGS_OFF_SLAB) {
+			size_t freelist_size_per_obj = sizeof(freelist_idx_t);
 			/*
 			 * Max number of objs-per-slab for caches which
 			 * use off-slab slabs. Needed to avoid a possible
 			 * looping condition in cache_grow().
 			 */
+			if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+				freelist_size_per_obj += sizeof(char);
 			offslab_limit = size;
-			offslab_limit /= sizeof(freelist_idx_t);
+			offslab_limit /= freelist_size_per_obj;
 
 			if (num > offslab_limit)
 				break;
@@ -2294,8 +2349,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 	if (!cachep->num)
 		return -E2BIG;
 
-	freelist_size =
-		ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align);
+	freelist_size = calculate_freelist_size(cachep->num, cachep->align);
 
 	/*
 	 * If the slab has been placed off-slab, and we have enough space then
@@ -2308,7 +2362,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 
 	if (flags & CFLGS_OFF_SLAB) {
 		/* really off slab. No need for manual alignment */
-		freelist_size = cachep->num * sizeof(freelist_idx_t);
+		freelist_size = calculate_freelist_size(cachep->num, 0);
 
 #ifdef CONFIG_PAGE_POISONING
 		/* If we're going to use the generic kernel_map_pages()
@@ -2612,6 +2666,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 		if (cachep->ctor)
 			cachep->ctor(objp);
 #endif
+		set_obj_status(page, i, OBJECT_FREE);
 		set_free_obj(page, i, i);
 	}
 }
@@ -2820,6 +2875,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 	BUG_ON(objnr >= cachep->num);
 	BUG_ON(objp != index_to_obj(cachep, page, objnr));
 
+	set_obj_status(page, objnr, OBJECT_FREE);
 	if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
 		if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2953,6 +3009,8 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 				gfp_t flags, void *objp, unsigned long caller)
 {
+	struct page *page;
+
 	if (!objp)
 		return objp;
 	if (cachep->flags & SLAB_POISON) {
@@ -2983,6 +3041,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
 		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
 	}
+
+	page = virt_to_head_page(objp);
+	set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
 	objp += obj_offset(cachep);
 	if (cachep->ctor && cachep->flags & SLAB_POISON)
 		cachep->ctor(objp);
@@ -4219,21 +4280,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c,
 						struct page *page)
 {
 	void *p;
-	int i, j;
+	int i;
 
 	if (n[0] == n[1])
 		return;
 	for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
-		bool active = true;
-
-		for (j = page->active; j < c->num; j++) {
-			/* Skip freed item */
-			if (get_free_obj(page, j) == i) {
-				active = false;
-				break;
-			}
-		}
-		if (!active)
+		if (get_obj_status(page, i) != OBJECT_ACTIVE)
 			continue;
 
 		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 735e01a0db6f..d31c4bacc6a2 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -55,7 +55,7 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
 			continue;
 		}
 
-#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
+#if !defined(CONFIG_SLUB)
 		if (!strcmp(s->name, name)) {
 			pr_err("%s (%s): Cache name already exists.\n",
 			       __func__, name);
diff --git a/mm/slub.c b/mm/slub.c
index b2b047327d76..73004808537e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1881,7 +1881,7 @@ redo:
 
 	new.frozen = 0;
 
-	if (!new.inuse && n->nr_partial > s->min_partial)
+	if (!new.inuse && n->nr_partial >= s->min_partial)
 		m = M_FREE;
 	else if (new.freelist) {
 		m = M_PARTIAL;
@@ -1992,7 +1992,7 @@ static void unfreeze_partials(struct kmem_cache *s,
 			new.freelist, new.counters,
 			"unfreezing slab"));
 
-		if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
+		if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
 			page->next = discard_page;
 			discard_page = page;
 		} else {
@@ -2620,7 +2620,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		return;
 	}
 
-	if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
+	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
 		goto slab_empty;
 
 	/*
diff --git a/mm/truncate.c b/mm/truncate.c
index 6a78c814bebf..eda247307164 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -355,14 +355,16 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	for ( ; ; ) {
 		cond_resched();
 		if (!pagevec_lookup_entries(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE),
-			indices)) {
+			min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
+			/* If all gone from start onwards, we're done */
 			if (index == start)
 				break;
+			/* Otherwise restart to make sure all gone */
 			index = start;
 			continue;
 		}
 		if (index == start && indices[0] >= end) {
+			/* All gone out of hole to be punched, we're done */
 			pagevec_remove_exceptionals(&pvec);
 			pagevec_release(&pvec);
 			break;
@@ -373,8 +375,11 @@ void truncate_inode_pages_range(struct address_space *mapping,
 
 			/* We rely upon deletion not changing page->index */
 			index = indices[i];
-			if (index >= end)
+			if (index >= end) {
+				/* Restart punch to make sure all gone */
+				index = start - 1;
 				break;
+			}
 
 			if (radix_tree_exceptional_entry(page)) {
 				clear_exceptional_entry(mapping, index, page);