Diffstat (limited to 'mm')
-rw-r--r-- | mm/frontswap.c      |  2
-rw-r--r-- | mm/huge_memory.c    |  7
-rw-r--r-- | mm/hugetlb.c        |  2
-rw-r--r-- | mm/memcontrol.c     | 28
-rw-r--r-- | mm/memory.c         |  9
-rw-r--r-- | mm/memory_hotplug.c |  9
-rw-r--r-- | mm/migrate.c        | 25
-rw-r--r-- | mm/mmu_notifier.c   | 79
-rw-r--r-- | mm/page_alloc.c     |  8
-rw-r--r-- | mm/pagewalk.c       | 70
-rw-r--r-- | mm/slab_common.c    |  4
-rw-r--r-- | mm/swap_state.c     | 18
-rw-r--r-- | mm/swapfile.c       |  2
13 files changed, 151 insertions(+), 112 deletions(-)
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 538367ef1372..1b24bdcb3197 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -319,7 +319,7 @@ void __frontswap_invalidate_area(unsigned type)
 			return;
 		frontswap_ops->invalidate_area(type);
 		atomic_set(&sis->frontswap_pages, 0);
-		memset(sis->frontswap_map, 0, sis->max / sizeof(long));
+		bitmap_zero(sis->frontswap_map, sis->max);
 	}
 	clear_bit(type, need_init);
 }
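For reference, the sizing rule behind this fix (and the matching vzalloc() change in the mm/swapfile.c hunk at the end of this series): frontswap_map holds one bit per swap page, stored as an array of unsigned long, so it occupies BITS_TO_LONGS(max) * sizeof(long) bytes and is cleared with bitmap_zero(). A minimal userspace sketch of that arithmetic; the page counts below are illustrative, not taken from the patch:

#include <stdio.h>

/* Userspace stand-ins for the kernel helpers the fix relies on. */
#define BITS_PER_LONG		(8 * sizeof(long))
#define BITS_TO_LONGS(nbits)	(((nbits) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
	/* Illustrative swap sizes in pages; not taken from the patch. */
	unsigned long sizes[] = { 100, 1UL << 20 };

	for (int i = 0; i < 2; i++) {
		unsigned long maxpages = sizes[i];
		/* One bit per page, rounded up to whole longs, as bitops require. */
		size_t correct = BITS_TO_LONGS(maxpages) * sizeof(long);
		/* The old expression only coincides with this on 64-bit and never rounds up. */
		size_t old = maxpages / sizeof(long);

		printf("maxpages=%lu: bitmap needs %zu bytes, old formula gave %zu\n",
		       maxpages, correct, old);
	}
	return 0;
}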
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 03a89a2f464b..362c329b83fe 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2325,7 +2325,12 @@ static void collapse_huge_page(struct mm_struct *mm,
 		pte_unmap(pte);
 		spin_lock(&mm->page_table_lock);
 		BUG_ON(!pmd_none(*pmd));
-		set_pmd_at(mm, address, pmd, _pmd);
+		/*
+		 * We can only use set_pmd_at when establishing
+		 * hugepmds and never for establishing regular pmds that
+		 * points to regular pagetables. Use pmd_populate for that
+		 */
+		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
 		spin_unlock(&mm->page_table_lock);
 		anon_vma_unlock_write(vma->anon_vma);
 		goto out;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f8feeeca6686..e2bfbf73a551 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2839,7 +2839,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (ptep) {
 		entry = huge_ptep_get(ptep);
 		if (unlikely(is_hugetlb_entry_migration(entry))) {
-			migration_entry_wait(mm, (pmd_t *)ptep, address);
+			migration_entry_wait_huge(mm, ptep);
 			return 0;
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cb1c9dedf9b6..194721839cf5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1199,7 +1199,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 
 			mz = mem_cgroup_zoneinfo(root, nid, zid);
 			iter = &mz->reclaim_iter[reclaim->priority];
-			last_visited = iter->last_visited;
 			if (prev && reclaim->generation != iter->generation) {
 				iter->last_visited = NULL;
 				goto out_unlock;
@@ -1218,13 +1217,12 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 			 * is alive.
 			 */
 			dead_count = atomic_read(&root->dead_count);
-			smp_rmb();
-			last_visited = iter->last_visited;
-			if (last_visited) {
-				if ((dead_count != iter->last_dead_count) ||
-				    !css_tryget(&last_visited->css)) {
-					last_visited = NULL;
-				}
+			if (dead_count == iter->last_dead_count) {
+				smp_rmb();
+				last_visited = iter->last_visited;
+				if (last_visited &&
+				    !css_tryget(&last_visited->css))
+					last_visited = NULL;
 			}
 		}
 
@@ -3141,8 +3139,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
 		return -ENOMEM;
 	}
 
-	INIT_WORK(&s->memcg_params->destroy,
-			kmem_cache_destroy_work_func);
 	s->memcg_params->is_root_cache = true;
 
 	/*
@@ -4108,8 +4104,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
 	if (mem_cgroup_disabled())
 		return NULL;
 
-	VM_BUG_ON(PageSwapCache(page));
-
 	if (PageTransHuge(page)) {
 		nr_pages <<= compound_order(page);
 		VM_BUG_ON(!PageTransHuge(page));
@@ -4205,6 +4199,18 @@ void mem_cgroup_uncharge_page(struct page *page)
 	if (page_mapped(page))
 		return;
 	VM_BUG_ON(page->mapping && !PageAnon(page));
+	/*
+	 * If the page is in swap cache, uncharge should be deferred
+	 * to the swap path, which also properly accounts swap usage
+	 * and handles memcg lifetime.
+	 *
+	 * Note that this check is not stable and reclaim may add the
+	 * page to swap cache at any time after this.  However, if the
+	 * page is not in swap cache by the time page->mapcount hits
+	 * 0, there won't be any page table references to the swap
+	 * slot, and reclaim will free it and not actually write the
+	 * page to disk.
+	 */
 	if (PageSwapCache(page))
 		return;
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
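The mem_cgroup_iter() change above reorders the validation: the cached position is only looked at after the root's dead_count is known to match the generation the pointer was cached under. A toy, single-threaded sketch of that ordering follows; the struct and function names are invented for illustration, and the real code additionally needs smp_rmb() and css_tryget() to be safe against concurrent cgroup destruction.

#include <stdio.h>

/* Invented names; a simplified model of the generation check, not kernel code. */
struct cached_iter {
	unsigned int last_dead_count;	/* generation the pointer was cached at */
	void *last_visited;		/* possibly stale cached position */
};

static void *resume_position(struct cached_iter *iter, unsigned int dead_count)
{
	void *last_visited = NULL;

	/* Check the generation first; only then trust the cached pointer. */
	if (dead_count == iter->last_dead_count)
		last_visited = iter->last_visited;

	/* NULL means: start the walk from scratch. */
	return last_visited;
}

int main(void)
{
	struct cached_iter iter = { .last_dead_count = 3, .last_visited = &iter };

	printf("same generation  -> %p\n", resume_position(&iter, 3));
	printf("stale generation -> %p\n", resume_position(&iter, 4));
	return 0;
}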
diff --git a/mm/memory.c b/mm/memory.c
index 6dc1882fbd72..61a262b08e53 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -220,7 +220,6 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
 	tlb->start	= -1UL;
 	tlb->end	= 0;
 	tlb->need_flush = 0;
-	tlb->fast_mode  = (num_possible_cpus() == 1);
 	tlb->local.next = NULL;
 	tlb->local.nr   = 0;
 	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
@@ -244,9 +243,6 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
 	tlb_table_flush(tlb);
 #endif
 
-	if (tlb_fast_mode(tlb))
-		return;
-
 	for (batch = &tlb->local; batch; batch = batch->next) {
 		free_pages_and_swap_cache(batch->pages, batch->nr);
 		batch->nr = 0;
@@ -288,11 +284,6 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 
 	VM_BUG_ON(!tlb->need_flush);
 
-	if (tlb_fast_mode(tlb)) {
-		free_page_and_swap_cache(page);
-		return 1; /* avoid calling tlb_flush_mmu() */
-	}
-
 	batch = tlb->active;
 	batch->pages[batch->nr++] = page;
 	if (batch->nr == batch->max) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a221fac1f47d..1ad92b46753e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -720,9 +720,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
 	start = phys_start_pfn << PAGE_SHIFT;
 	size = nr_pages * PAGE_SIZE;
 	ret = release_mem_region_adjustable(&iomem_resource, start, size);
-	if (ret)
-		pr_warn("Unable to release resource <%016llx-%016llx> (%d)\n",
-			start, start + size - 1, ret);
+	if (ret) {
+		resource_size_t endres = start + size - 1;
+
+		pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
+			&start, &endres, ret);
+	}
 
 	sections_to_remove = nr_pages / PAGES_PER_SECTION;
 	for (i = 0; i < sections_to_remove; i++) {
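The printk change above exists because resource_size_t is 32 or 64 bits wide depending on CONFIG_PHYS_ADDR_T_64BIT, so passing it directly to a %016llx conversion is a format/argument mismatch on 32-bit configurations; the kernel's %pa specifier avoids this by taking the address of the value. A userspace analog of the same pitfall, using an explicit widening cast in place of %pa; the typedef and the PHYS_ADDR_T_64BIT macro below are stand-ins, not kernel headers:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel's config-dependent resource_size_t width. */
#ifdef PHYS_ADDR_T_64BIT
typedef uint64_t resource_size_t;
#else
typedef uint32_t resource_size_t;
#endif

int main(void)
{
	resource_size_t start = 0x40000000u, size = 0x10000000u;
	resource_size_t endres = start + size - 1;

	/* Widen explicitly so the format string is correct for either typedef. */
	printf("Unable to release resource <%016llx-%016llx>\n",
	       (unsigned long long)start, (unsigned long long)endres);
	return 0;
}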
diff --git a/mm/migrate.c b/mm/migrate.c
index 27ed22579fd9..6f0c24438bba 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -165,7 +165,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 		pte = arch_make_huge_pte(pte, vma, new, 0);
 	}
 #endif
-	flush_cache_page(vma, addr, pte_pfn(pte));
+	flush_dcache_page(new);
 	set_pte_at(mm, addr, ptep, pte);
 
 	if (PageHuge(new)) {
@@ -200,15 +200,14 @@ static void remove_migration_ptes(struct page *old, struct page *new)
  * get to the page and wait until migration is finished.
  * When we return from this function the fault will be retried.
  */
-void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
-				unsigned long address)
+static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
+				spinlock_t *ptl)
 {
-	pte_t *ptep, pte;
-	spinlock_t *ptl;
+	pte_t pte;
 	swp_entry_t entry;
 	struct page *page;
 
-	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	spin_lock(ptl);
 	pte = *ptep;
 	if (!is_swap_pte(pte))
 		goto out;
@@ -236,6 +235,20 @@ out:
 	pte_unmap_unlock(ptep, ptl);
 }
 
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long address)
+{
+	spinlock_t *ptl = pte_lockptr(mm, pmd);
+	pte_t *ptep = pte_offset_map(pmd, address);
+	__migration_entry_wait(mm, ptep, ptl);
+}
+
+void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte)
+{
+	spinlock_t *ptl = &(mm)->page_table_lock;
+	__migration_entry_wait(mm, pte, ptl);
+}
+
 #ifdef CONFIG_BLOCK
 /* Returns true if all buffers are successfully locked */
 static bool buffer_migrate_lock_buffers(struct buffer_head *head,
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index be04122fb277..6725ff183374 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -40,48 +40,44 @@ void __mmu_notifier_release(struct mm_struct *mm)
 	int id;
 
 	/*
-	 * srcu_read_lock() here will block synchronize_srcu() in
-	 * mmu_notifier_unregister() until all registered
-	 * ->release() callouts this function makes have
-	 * returned.
+	 * SRCU here will block mmu_notifier_unregister until
+	 * ->release returns.
 	 */
 	id = srcu_read_lock(&srcu);
+	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
+		/*
+		 * If ->release runs before mmu_notifier_unregister it must be
+		 * handled, as it's the only way for the driver to flush all
+		 * existing sptes and stop the driver from establishing any more
+		 * sptes before all the pages in the mm are freed.
+		 */
+		if (mn->ops->release)
+			mn->ops->release(mn, mm);
+	srcu_read_unlock(&srcu, id);
+
 	spin_lock(&mm->mmu_notifier_mm->lock);
 	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
 		mn = hlist_entry(mm->mmu_notifier_mm->list.first,
 				 struct mmu_notifier,
 				 hlist);
-
 		/*
-		 * Unlink.  This will prevent mmu_notifier_unregister()
-		 * from also making the ->release() callout.
+		 * We arrived before mmu_notifier_unregister so
+		 * mmu_notifier_unregister will do nothing other than to wait
+		 * for ->release to finish and for mmu_notifier_unregister to
+		 * return.
 		 */
 		hlist_del_init_rcu(&mn->hlist);
-		spin_unlock(&mm->mmu_notifier_mm->lock);
-
-		/*
-		 * Clear sptes. (see 'release' description in mmu_notifier.h)
-		 */
-		if (mn->ops->release)
-			mn->ops->release(mn, mm);
-
-		spin_lock(&mm->mmu_notifier_mm->lock);
 	}
 	spin_unlock(&mm->mmu_notifier_mm->lock);
 
 	/*
-	 * All callouts to ->release() which we have done are complete.
-	 * Allow synchronize_srcu() in mmu_notifier_unregister() to complete
-	 */
-	srcu_read_unlock(&srcu, id);
-
-	/*
-	 * mmu_notifier_unregister() may have unlinked a notifier and may
-	 * still be calling out to it.  Additionally, other notifiers
-	 * may have been active via vmtruncate() et. al. Block here
-	 * to ensure that all notifier callouts for this mm have been
-	 * completed and the sptes are really cleaned up before returning
-	 * to exit_mmap().
+	 * synchronize_srcu here prevents mmu_notifier_release from returning to
+	 * exit_mmap (which would proceed with freeing all pages in the mm)
+	 * until the ->release method returns, if it was invoked by
+	 * mmu_notifier_unregister.
+	 *
+	 * The mmu_notifier_mm can't go away from under us because one mm_count
+	 * is held by exit_mmap.
 	 */
 	synchronize_srcu(&srcu);
 }
@@ -292,31 +288,34 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 {
 	BUG_ON(atomic_read(&mm->mm_count) <= 0);
 
-	spin_lock(&mm->mmu_notifier_mm->lock);
 	if (!hlist_unhashed(&mn->hlist)) {
+		/*
+		 * SRCU here will force exit_mmap to wait for ->release to
+		 * finish before freeing the pages.
+		 */
 		int id;
 
+		id = srcu_read_lock(&srcu);
 		/*
-		 * Ensure we synchronize up with __mmu_notifier_release().
+		 * exit_mmap will block in mmu_notifier_release to guarantee
+		 * that ->release is called before freeing the pages.
 		 */
-		id = srcu_read_lock(&srcu);
-
-		hlist_del_rcu(&mn->hlist);
-		spin_unlock(&mm->mmu_notifier_mm->lock);
-
 		if (mn->ops->release)
 			mn->ops->release(mn, mm);
+		srcu_read_unlock(&srcu, id);
 
+		spin_lock(&mm->mmu_notifier_mm->lock);
 		/*
-		 * Allow __mmu_notifier_release() to complete.
+		 * Can not use list_del_rcu() since __mmu_notifier_release
+		 * can delete it before we hold the lock.
 		 */
-		srcu_read_unlock(&srcu, id);
-	} else
+		hlist_del_init_rcu(&mn->hlist);
 		spin_unlock(&mm->mmu_notifier_mm->lock);
+	}
 
 	/*
-	 * Wait for any running method to finish, including ->release() if it
-	 * was run by __mmu_notifier_release() instead of us.
+	 * Wait for any running method to finish, of course including
+	 * ->release if it was run by mmu_notifier_relase instead of us.
 	 */
 	synchronize_srcu(&srcu);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 98cbdf6e5532..c3edb624fccf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1628,6 +1628,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	long min = mark;
 	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
 	int o;
+	long free_cma = 0;
 
 	free_pages -= (1 << order) - 1;
 	if (alloc_flags & ALLOC_HIGH)
@@ -1637,9 +1638,10 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 #ifdef CONFIG_CMA
 	/* If allocation can't use CMA areas don't use free CMA pages */
 	if (!(alloc_flags & ALLOC_CMA))
-		free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
+		free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
 #endif
-	if (free_pages <= min + lowmem_reserve)
+
+	if (free_pages - free_cma <= min + lowmem_reserve)
 		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
@@ -5158,7 +5160,7 @@ unsigned long free_reserved_area(unsigned long start, unsigned long end,
 	for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) {
 		if (poison)
 			memset((void *)pos, poison, PAGE_SIZE);
-		free_reserved_page(virt_to_page(pos));
+		free_reserved_page(virt_to_page((void *)pos));
 	}
 
 	if (pages && s)
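A simplified model of the first comparison in __zone_watermark_ok() after the change above: when the allocation cannot use CMA areas, free CMA pages are discounted only in the comparison rather than being subtracted from free_pages itself. The helper name and the numbers below are illustrative only:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative sketch of the watermark comparison, not the kernel function. */
static bool watermark_ok(long free_pages, long free_cma, long min, long lowmem_reserve)
{
	/* CMA pages the allocation can't use must not count as headroom. */
	return free_pages - free_cma > min + lowmem_reserve;
}

int main(void)
{
	/* 10000 pages free, 8000 of them in CMA, watermark 1500 + 500 reserve. */
	printf("CMA usable:     %d\n", watermark_ok(10000, 0, 1500, 500));
	printf("CMA not usable: %d\n", watermark_ok(10000, 8000, 1500, 500));
	return 0;
}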
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 35aa294656cd..5da2cbcfdbb5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -127,28 +127,7 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
 	return 0;
 }
 
-static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
-{
-	struct vm_area_struct *vma;
-
-	/* We don't need vma lookup at all. */
-	if (!walk->hugetlb_entry)
-		return NULL;
-
-	VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
-	vma = find_vma(walk->mm, addr);
-	if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
-		return vma;
-
-	return NULL;
-}
-
 #else /* CONFIG_HUGETLB_PAGE */
-static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
-{
-	return NULL;
-}
-
 static int walk_hugetlb_range(struct vm_area_struct *vma,
 		unsigned long addr, unsigned long end,
 		struct mm_walk *walk)
@@ -198,30 +177,53 @@ int walk_page_range(unsigned long addr, unsigned long end,
 	if (!walk->mm)
 		return -EINVAL;
 
+	VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+
 	pgd = pgd_offset(walk->mm, addr);
 	do {
-		struct vm_area_struct *vma;
+		struct vm_area_struct *vma = NULL;
 
 		next = pgd_addr_end(addr, end);
 
 		/*
-		 * handle hugetlb vma individually because pagetable walk for
-		 * the hugetlb page is dependent on the architecture and
-		 * we can't handled it in the same manner as non-huge pages.
+		 * This function was not intended to be vma based.
+		 * But there are vma special cases to be handled:
+		 * - hugetlb vma's
+		 * - VM_PFNMAP vma's
 		 */
-		vma = hugetlb_vma(addr, walk);
+		vma = find_vma(walk->mm, addr);
 		if (vma) {
-			if (vma->vm_end < next)
+			/*
+			 * There are no page structures backing a VM_PFNMAP
+			 * range, so do not allow split_huge_page_pmd().
+			 */
+			if ((vma->vm_start <= addr) &&
+			    (vma->vm_flags & VM_PFNMAP)) {
 				next = vma->vm_end;
+				pgd = pgd_offset(walk->mm, next);
+				continue;
+			}
 			/*
-			 * Hugepage is very tightly coupled with vma, so
-			 * walk through hugetlb entries within a given vma.
+			 * Handle hugetlb vma individually because pagetable
+			 * walk for the hugetlb page is dependent on the
+			 * architecture and we can't handled it in the same
+			 * manner as non-huge pages.
 			 */
-			err = walk_hugetlb_range(vma, addr, next, walk);
-			if (err)
-				break;
-			pgd = pgd_offset(walk->mm, next);
-			continue;
+			if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
+			    is_vm_hugetlb_page(vma)) {
+				if (vma->vm_end < next)
+					next = vma->vm_end;
+				/*
+				 * Hugepage is very tightly coupled with vma,
+				 * so walk through hugetlb entries within a
+				 * given vma.
+				 */
+				err = walk_hugetlb_range(vma, addr, next, walk);
+				if (err)
+					break;
+				pgd = pgd_offset(walk->mm, next);
+				continue;
+			}
 		}
 
 		if (pgd_none_or_clear_bad(pgd)) {
diff --git a/mm/slab_common.c b/mm/slab_common.c
index ff3218a0f5e1..2d414508e9ec 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -373,8 +373,10 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
 {
 	int index;
 
-	if (WARN_ON_ONCE(size > KMALLOC_MAX_SIZE))
+	if (size > KMALLOC_MAX_SIZE) {
+		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
 		return NULL;
+	}
 
 	if (size <= 192) {
 		if (!size)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b3d40dcf3624..f24ab0dff554 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -336,8 +336,24 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * Swap entry may have been freed since our caller observed it.
 		 */
 		err = swapcache_prepare(entry);
-		if (err == -EEXIST) {	/* seems racy */
+		if (err == -EEXIST) {
 			radix_tree_preload_end();
+			/*
+			 * We might race against get_swap_page() and stumble
+			 * across a SWAP_HAS_CACHE swap_map entry whose page
+			 * has not been brought into the swapcache yet, while
+			 * the other end is scheduled away waiting on discard
+			 * I/O completion at scan_swap_map().
+			 *
+			 * In order to avoid turning this transitory state
+			 * into a permanent loop around this -EEXIST case
+			 * if !CONFIG_PREEMPT and the I/O completion happens
+			 * to be waiting on the CPU waitqueue where we are now
+			 * busy looping, we just conditionally invoke the
+			 * scheduler here, if there are some more important
+			 * tasks to run.
+			 */
+			cond_resched();
 			continue;
 		}
 		if (err) {		/* swp entry is obsolete ? */
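A userspace analog of the swap_state.c change above: a loop that retries on a transient -EEXIST must yield so the other party can finish, otherwise it can spin indefinitely on a non-preemptible kernel. In this sketch sched_yield() stands in for cond_resched(), and try_claim() is an invented stand-in for swapcache_prepare():

#include <errno.h>
#include <sched.h>
#include <stdio.h>

static int attempts;

/* Invented stand-in: pretend the slot stays transiently busy for a few tries. */
static int try_claim(void)
{
	return ++attempts < 3 ? -EEXIST : 0;
}

int main(void)
{
	int err;

	for (;;) {
		err = try_claim();
		if (err == -EEXIST) {
			sched_yield();	/* let the current owner of the slot make progress */
			continue;
		}
		break;
	}
	printf("claimed after %d attempts (err=%d)\n", attempts, err);
	return 0;
}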
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6c340d908b27..746af55b8455 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2116,7 +2116,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	}
 	/* frontswap enabled? set up bit-per-page map for frontswap */
 	if (frontswap_enabled)
-		frontswap_map = vzalloc(maxpages / sizeof(long));
+		frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
 
 	if (p->bdev) {
 		if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {