Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |  4
-rw-r--r--  mm/compaction.c     | 20
-rw-r--r--  mm/filemap.c        |  4
-rw-r--r--  mm/huge_memory.c    | 20
-rw-r--r--  mm/ksm.c            |  2
-rw-r--r--  mm/memcontrol.c     | 20
-rw-r--r--  mm/memory-failure.c |  8
-rw-r--r--  mm/memory.c         | 15
-rw-r--r--  mm/migrate.c        | 11
-rw-r--r--  mm/mprotect.c       | 25
-rw-r--r--  mm/page-writeback.c |  5
-rw-r--r--  mm/page_alloc.c     | 30
-rw-r--r--  mm/slub.c           | 38
-rw-r--r--  mm/swap.c           |  4
-rw-r--r--  mm/swap_state.c     | 63
-rw-r--r--  mm/swapfile.c       | 11
-rw-r--r--  mm/vmpressure.c     |  1
-rw-r--r--  mm/vmstat.c         |  4
18 files changed, 190 insertions(+), 95 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 2d9f1504d75e..2888024e0b0a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -575,5 +575,5 @@ config PGTABLE_MAPPING
           then you should select this. This causes zsmalloc to use page table
           mapping rather than copying for object mapping.
 
-          You can check speed with zsmalloc benchmark[1].
-          [1] https://github.com/spartacus06/zsmalloc
+          You can check speed with zsmalloc benchmark:
+          https://github.com/spartacus06/zsmapbench
diff --git a/mm/compaction.c b/mm/compaction.c
index b48c5259ea33..918577595ea8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -251,7 +251,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 {
         int nr_scanned = 0, total_isolated = 0;
         struct page *cursor, *valid_page = NULL;
-        unsigned long nr_strict_required = end_pfn - blockpfn;
         unsigned long flags;
         bool locked = false;
 
@@ -264,11 +263,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 
                 nr_scanned++;
                 if (!pfn_valid_within(blockpfn))
-                        continue;
+                        goto isolate_fail;
+
                 if (!valid_page)
                         valid_page = page;
                 if (!PageBuddy(page))
-                        continue;
+                        goto isolate_fail;
 
                 /*
                  * The zone lock must be held to isolate freepages.
@@ -289,12 +289,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 
                 /* Recheck this is a buddy page under lock */
                 if (!PageBuddy(page))
-                        continue;
+                        goto isolate_fail;
 
                 /* Found a free page, break it into order-0 pages */
                 isolated = split_free_page(page);
-                if (!isolated && strict)
-                        break;
                 total_isolated += isolated;
                 for (i = 0; i < isolated; i++) {
                         list_add(&page->lru, freelist);
@@ -305,7 +303,15 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
                 if (isolated) {
                         blockpfn += isolated - 1;
                         cursor += isolated - 1;
+                        continue;
                 }
+
+isolate_fail:
+                if (strict)
+                        break;
+                else
+                        continue;
+
         }
 
         trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
@@ -315,7 +321,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
          * pages requested were isolated. If there were any failures, 0 is
          * returned and CMA will fail.
          */
-        if (strict && nr_strict_required > total_isolated)
+        if (strict && blockpfn < end_pfn)
                 total_isolated = 0;
 
         if (locked)
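
The new isolate_fail label above merges two bail-out paths: strict (CMA) callers abort the scan on the first page that cannot be isolated, while regular compaction just skips it, and the final "strict && blockpfn < end_pfn" test replaces the old nr_strict_required counter. The standalone C sketch below mirrors only that control flow; is_free() is a made-up stand-in for the PageBuddy()/split_free_page() checks, not kernel code.

    #include <stdbool.h>
    #include <stdio.h>

    /* Made-up stand-in for the PageBuddy()/split_free_page() checks. */
    static bool is_free(int pfn)
    {
        return pfn % 3 != 0;        /* arbitrary pattern for the demo */
    }

    /* Scan [start, end); strict callers need every page, others take what they can. */
    static int isolate_range(int start, int end, bool strict)
    {
        int pfn, isolated = 0;

        for (pfn = start; pfn < end; pfn++) {
            if (!is_free(pfn))
                goto isolate_fail;
            isolated++;
            continue;

    isolate_fail:
            if (strict)
                break;
            else
                continue;
        }

        /* Same idea as the new check: a strict scan must cover the whole range. */
        if (strict && pfn < end)
            isolated = 0;

        return isolated;
    }

    int main(void)
    {
        printf("best-effort: %d\n", isolate_range(0, 10, false));
        printf("strict:      %d\n", isolate_range(0, 10, true));
        return 0;
    }
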
diff --git a/mm/filemap.c b/mm/filemap.c
index d56d3c145b9f..7a13f6ac5421 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2553,8 +2553,8 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
         if (ret > 0) {
                 ssize_t err;
 
-                err = generic_write_sync(file, pos, ret);
-                if (err < 0 && ret > 0)
+                err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+                if (err < 0)
                         ret = err;
         }
         return ret;
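
The generic_file_aio_write() fix reflects that iocb->ki_pos has already been advanced past the freshly written data by the time the sync runs, so the range to sync is [ki_pos - ret, ki_pos) rather than one based on the stale pos argument. A tiny sketch of that bookkeeping with made-up numbers; struct kiocb_like is not the kernel structure.

    #include <stdio.h>

    struct kiocb_like {
        long long ki_pos;           /* file position tracked by the iocb */
    };

    int main(void)
    {
        struct kiocb_like iocb = { .ki_pos = 4096 };
        long long ret = 1024;       /* bytes just written at offset 4096 */

        /* The write path advances ki_pos past the new data ... */
        iocb.ki_pos += ret;         /* ki_pos is now 5120 */

        /* ... so the region that still needs syncing starts at ki_pos - ret. */
        long long sync_start = iocb.ki_pos - ret;

        printf("sync range: [%lld, %lld)\n", sync_start, iocb.ki_pos);
        return 0;
    }
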
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 82166bf974e1..1546655a2d78 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1166,8 +1166,10 @@ alloc:
         } else {
                 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
                                 pmd, orig_pmd, page, haddr);
-                if (ret & VM_FAULT_OOM)
+                if (ret & VM_FAULT_OOM) {
                         split_huge_page(page);
+                        ret |= VM_FAULT_FALLBACK;
+                }
                 put_page(page);
         }
         count_vm_event(THP_FAULT_FALLBACK);
@@ -1179,9 +1181,10 @@ alloc:
                 if (page) {
                         split_huge_page(page);
                         put_page(page);
-                }
+                } else
+                        split_huge_page_pmd(vma, address, pmd);
+                ret |= VM_FAULT_FALLBACK;
                 count_vm_event(THP_FAULT_FALLBACK);
-                ret |= VM_FAULT_OOM;
                 goto out;
         }
 
@@ -1545,6 +1548,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                                 entry = pmd_mknonnuma(entry);
                         entry = pmd_modify(entry, newprot);
                         ret = HPAGE_PMD_NR;
+                        set_pmd_at(mm, addr, pmd, entry);
                         BUG_ON(pmd_write(entry));
                 } else {
                         struct page *page = pmd_page(*pmd);
@@ -1557,16 +1561,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                          */
                         if (!is_huge_zero_page(page) &&
                             !pmd_numa(*pmd)) {
-                                entry = *pmd;
-                                entry = pmd_mknuma(entry);
+                                pmdp_set_numa(mm, addr, pmd);
                                 ret = HPAGE_PMD_NR;
                         }
                 }
-
-                /* Set PMD if cleared earlier */
-                if (ret == HPAGE_PMD_NR)
-                        set_pmd_at(mm, addr, pmd, entry);
-
                 spin_unlock(ptl);
         }
 
@@ -1963,7 +1961,7 @@ out:
         return ret;
 }
 
-#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
+#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
 
 int hugepage_madvise(struct vm_area_struct *vma,
                      unsigned long *vm_flags, int advice)
diff --git a/mm/ksm.c b/mm/ksm.c
index aa4c7c7250c1..68710e80994a 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -444,7 +444,7 @@ static void break_cow(struct rmap_item *rmap_item)
 static struct page *page_trans_compound_anon(struct page *page)
 {
         if (PageTransCompound(page)) {
-                struct page *head = compound_trans_head(page);
+                struct page *head = compound_head(page);
                 /*
                  * head may actually be splitted and freed from under
                  * us but it's ok here.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 53385cd4e6f0..5b6b0039f725 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1127,8 +1127,8 @@ skip_node:
          * skipping css reference should be safe.
          */
         if (next_css) {
-                if ((next_css->flags & CSS_ONLINE) &&
-                    (next_css == &root->css || css_tryget(next_css)))
+                if ((next_css == &root->css) ||
+                    ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)))
                         return mem_cgroup_from_css(next_css);
 
                 prev_css = next_css;
@@ -1687,7 +1687,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
          * protects memcg_name and makes sure that parallel ooms do not
          * interleave
          */
-        static DEFINE_SPINLOCK(oom_info_lock);
+        static DEFINE_MUTEX(oom_info_lock);
         struct cgroup *task_cgrp;
         struct cgroup *mem_cgrp;
         static char memcg_name[PATH_MAX];
@@ -1698,7 +1698,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
         if (!p)
                 return;
 
-        spin_lock(&oom_info_lock);
+        mutex_lock(&oom_info_lock);
         rcu_read_lock();
 
         mem_cgrp = memcg->css.cgroup;
@@ -1767,7 +1767,7 @@ done:
 
                 pr_cont("\n");
         }
-        spin_unlock(&oom_info_lock);
+        mutex_unlock(&oom_info_lock);
 }
 
 /*
@@ -6595,6 +6595,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
         struct mem_cgroup_event *event, *tmp;
+        struct cgroup_subsys_state *iter;
 
         /*
          * Unregister events and notify userspace.
@@ -6611,7 +6612,14 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
         kmem_cgroup_css_offline(memcg);
 
         mem_cgroup_invalidate_reclaim_iterators(memcg);
-        mem_cgroup_reparent_charges(memcg);
+
+        /*
+         * This requires that offlining is serialized. Right now that is
+         * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
+         */
+        css_for_each_descendant_post(iter, css)
+                mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
+
         mem_cgroup_destroy_all_caches(memcg);
         vmpressure_cleanup(&memcg->vmpressure);
 }
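
Turning oom_info_lock into a mutex matches what it protects: a long, multi-line report printed from a context that may sleep, where the only requirement is that parallel reports do not interleave. A rough userspace analogue with pthreads, serializing a multi-line dump the same way; this is not the memcg code itself.

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t report_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Print a multi-line report without interleaving with other reporters. */
    static void *print_report(void *arg)
    {
        int id = *(int *)arg;

        pthread_mutex_lock(&report_lock);
        printf("report %d: header\n", id);
        printf("report %d: usage details\n", id);
        printf("report %d: footer\n", id);
        pthread_mutex_unlock(&report_lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t[2];
        int ids[2] = { 1, 2 };

        for (int i = 0; i < 2; i++)
            pthread_create(&t[i], NULL, print_report, &ids[i]);
        for (int i = 0; i < 2; i++)
            pthread_join(t[i], NULL);
        return 0;
    }
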
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 4f08a2d61487..90002ea43638 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -945,8 +945,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
          * to it. Similarly, page lock is shifted.
          */
         if (hpage != p) {
-                put_page(hpage);
-                get_page(p);
+                if (!(flags & MF_COUNT_INCREASED)) {
+                        put_page(hpage);
+                        get_page(p);
+                }
                 lock_page(p);
                 unlock_page(hpage);
                 *hpagep = p;
@@ -1649,7 +1651,7 @@ int soft_offline_page(struct page *page, int flags)
 {
         int ret;
         unsigned long pfn = page_to_pfn(page);
-        struct page *hpage = compound_trans_head(page);
+        struct page *hpage = compound_head(page);
 
         if (PageHWPoison(page)) {
                 pr_info("soft offline: %#lx page already poisoned\n", pfn);
diff --git a/mm/memory.c b/mm/memory.c
index be6a0c0d4ae0..22dfa617bddb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3348,6 +3348,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 if (ret & VM_FAULT_LOCKED)
                         unlock_page(vmf.page);
                 ret = VM_FAULT_HWPOISON;
+                page_cache_release(vmf.page);
                 goto uncharge_out;
         }
 
@@ -3703,7 +3704,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         if (unlikely(is_vm_hugetlb_page(vma)))
                 return hugetlb_fault(mm, vma, address, flags);
 
-retry:
         pgd = pgd_offset(mm, address);
         pud = pud_alloc(mm, pgd, address);
         if (!pud)
@@ -3741,20 +3741,13 @@ retry:
                 if (dirty && !pmd_write(orig_pmd)) {
                         ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
                                                   orig_pmd);
-                        /*
-                         * If COW results in an oom, the huge pmd will
-                         * have been split, so retry the fault on the
-                         * pte for a smaller charge.
-                         */
-                        if (unlikely(ret & VM_FAULT_OOM))
-                                goto retry;
-                        return ret;
+                        if (!(ret & VM_FAULT_FALLBACK))
+                                return ret;
                 } else {
                         huge_pmd_set_accessed(mm, vma, address, pmd,
                                               orig_pmd, dirty);
+                        return 0;
                 }
-
-                return 0;
         }
 }
 
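
With the retry label gone, __handle_mm_fault() relies on do_huge_pmd_wp_page() reporting VM_FAULT_FALLBACK: when the huge path cannot complete, the caller simply falls through to the normal pte path instead of looping. A compact sketch of that flag-based dispatch follows; the handler names and behavior are hypothetical, only the VM_FAULT_FALLBACK convention is taken from the patch.

    #include <stdio.h>

    #define VM_FAULT_FALLBACK 0x0800    /* illustrative value */

    /* Hypothetical huge-page handler: may ask the caller to fall back. */
    static int handle_huge_fault(int can_use_huge_page)
    {
        if (!can_use_huge_page)
            return VM_FAULT_FALLBACK;
        return 0;                       /* handled as a huge page */
    }

    /* Hypothetical small-page handler. */
    static int handle_pte_fault(void)
    {
        return 0;                       /* handled as a normal page */
    }

    static int handle_fault(int can_use_huge_page)
    {
        int ret = handle_huge_fault(can_use_huge_page);

        /* Fall through to the pte path only when explicitly asked to. */
        if (!(ret & VM_FAULT_FALLBACK))
            return ret;
        return handle_pte_fault();
    }

    int main(void)
    {
        printf("huge path:     %d\n", handle_fault(1));
        printf("fallback path: %d\n", handle_fault(0));
        return 0;
    }
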
diff --git a/mm/migrate.c b/mm/migrate.c
index 482a33d89134..b494fdb9a636 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1158,7 +1158,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
                                         pm->node);
         else
                 return alloc_pages_exact_node(pm->node,
-                                GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
+                                GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
 }
 
 /*
@@ -1544,9 +1544,9 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
         struct page *newpage;
 
         newpage = alloc_pages_exact_node(nid,
-                                         (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
-                                          __GFP_NOMEMALLOC | __GFP_NORETRY |
-                                          __GFP_NOWARN) &
+                                         (GFP_HIGHUSER_MOVABLE |
+                                          __GFP_THISNODE | __GFP_NOMEMALLOC |
+                                          __GFP_NORETRY | __GFP_NOWARN) &
                                          ~GFP_IOFS, 0);
 
         return newpage;
@@ -1747,7 +1747,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
                 goto out_dropref;
 
         new_page = alloc_pages_node(node,
-                (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
+                (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT,
+                HPAGE_PMD_ORDER);
         if (!new_page)
                 goto out_fail;
 
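
The migrate.c hunks swap the composite GFP_THISNODE (which, with NUMA enabled, is __GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN) for the single __GFP_THISNODE bit, so the allocations still stay on the target node but no longer look like the special GFP_THISNODE class that the mm/page_alloc.c changes below test for. A small bitmask sketch with stand-in values; the real definitions live in include/linux/gfp.h.

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in bit values; not the real gfp.h constants. */
    #define __GFP_THISNODE  0x01u
    #define __GFP_NORETRY   0x02u
    #define __GFP_NOWARN    0x04u
    #define GFP_THISNODE    (__GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN)

    /* Testing for a composite class requires every one of its bits. */
    static bool gfp_thisnode_allocation(unsigned int gfp_mask)
    {
        return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
    }

    int main(void)
    {
        printf("__GFP_THISNODE alone   -> %d\n",
               gfp_thisnode_allocation(__GFP_THISNODE));    /* 0: not the class */
        printf("full GFP_THISNODE mask -> %d\n",
               gfp_thisnode_allocation(GFP_THISNODE));      /* 1: the class */
        return 0;
    }
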
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 7332c1785744..769a67a15803 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -58,36 +58,27 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                 if (pte_numa(ptent))
                                         ptent = pte_mknonnuma(ptent);
                                 ptent = pte_modify(ptent, newprot);
+                                /*
+                                 * Avoid taking write faults for pages we
+                                 * know to be dirty.
+                                 */
+                                if (dirty_accountable && pte_dirty(ptent))
+                                        ptent = pte_mkwrite(ptent);
+                                ptep_modify_prot_commit(mm, addr, pte, ptent);
                                 updated = true;
                         } else {
                                 struct page *page;
 
-                                ptent = *pte;
                                 page = vm_normal_page(vma, addr, oldpte);
                                 if (page && !PageKsm(page)) {
                                         if (!pte_numa(oldpte)) {
-                                                ptent = pte_mknuma(ptent);
-                                                set_pte_at(mm, addr, pte, ptent);
+                                                ptep_set_numa(mm, addr, pte);
                                                 updated = true;
                                         }
                                 }
                         }
-
-                        /*
-                         * Avoid taking write faults for pages we know to be
-                         * dirty.
-                         */
-                        if (dirty_accountable && pte_dirty(ptent)) {
-                                ptent = pte_mkwrite(ptent);
-                                updated = true;
-                        }
-
                         if (updated)
                                 pages++;
-
-                        /* Only !prot_numa always clears the pte */
-                        if (!prot_numa)
-                                ptep_modify_prot_commit(mm, addr, pte, ptent);
                 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
                         swp_entry_t entry = pte_to_swp_entry(oldpte);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2d30e2cfe804..7106cb1aca8e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2173,11 +2173,12 @@ int __set_page_dirty_nobuffers(struct page *page)
         if (!TestSetPageDirty(page)) {
                 struct address_space *mapping = page_mapping(page);
                 struct address_space *mapping2;
+                unsigned long flags;
 
                 if (!mapping)
                         return 1;
 
-                spin_lock_irq(&mapping->tree_lock);
+                spin_lock_irqsave(&mapping->tree_lock, flags);
                 mapping2 = page_mapping(page);
                 if (mapping2) { /* Race with truncate? */
                         BUG_ON(mapping2 != mapping);
@@ -2186,7 +2187,7 @@ int __set_page_dirty_nobuffers(struct page *page)
                         radix_tree_tag_set(&mapping->page_tree,
                                 page_index(page), PAGECACHE_TAG_DIRTY);
                 }
-                spin_unlock_irq(&mapping->tree_lock);
+                spin_unlock_irqrestore(&mapping->tree_lock, flags);
                 if (mapping->host) {
                         /* !PageAnon && !swapper_space */
                         __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e3758a09a009..3bac76ae4b30 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -369,9 +369,11 @@ void prep_compound_page(struct page *page, unsigned long order)
         __SetPageHead(page);
         for (i = 1; i < nr_pages; i++) {
                 struct page *p = page + i;
-                __SetPageTail(p);
                 set_page_count(p, 0);
                 p->first_page = page;
+                /* Make sure p->first_page is always valid for PageTail() */
+                smp_wmb();
+                __SetPageTail(p);
         }
 }
 
@@ -1236,6 +1238,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
         }
         local_irq_restore(flags);
 }
+static bool gfp_thisnode_allocation(gfp_t gfp_mask)
+{
+        return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
+}
+#else
+static bool gfp_thisnode_allocation(gfp_t gfp_mask)
+{
+        return false;
+}
 #endif
 
 /*
@@ -1572,7 +1583,13 @@ again:
                                           get_pageblock_migratetype(page));
         }
 
-        __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+        /*
+         * NOTE: GFP_THISNODE allocations do not partake in the kswapd
+         * aging protocol, so they can't be fair.
+         */
+        if (!gfp_thisnode_allocation(gfp_flags))
+                __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+
         __count_zone_vm_events(PGALLOC, zone, 1 << order);
         zone_statistics(preferred_zone, zone, gfp_flags);
         local_irq_restore(flags);
@@ -1944,8 +1961,12 @@ zonelist_scan:
                  * ultimately fall back to remote zones that do not
                  * partake in the fairness round-robin cycle of this
                  * zonelist.
+                 *
+                 * NOTE: GFP_THISNODE allocations do not partake in
+                 * the kswapd aging protocol, so they can't be fair.
                  */
-                if (alloc_flags & ALLOC_WMARK_LOW) {
+                if ((alloc_flags & ALLOC_WMARK_LOW) &&
+                    !gfp_thisnode_allocation(gfp_mask)) {
                         if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
                                 continue;
                         if (!zone_local(preferred_zone, zone))
@@ -2501,8 +2522,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
          * allowed per node queues are empty and that nodes are
          * over allocated.
          */
-        if (IS_ENABLED(CONFIG_NUMA) &&
-            (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+        if (gfp_thisnode_allocation(gfp_mask))
                 goto nopage;
 
 restart:
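
The prep_compound_page() hunk publishes p->first_page before the tail flag becomes visible, so any PageTail() observer that sees the flag is guaranteed to see a valid first_page as well. Below is a userspace analogue of that publish/observe pattern using C11 release/acquire atomics in place of smp_wmb(); it is a sketch, not the kernel primitives.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct fake_page {
        struct fake_page *first_page;   /* payload, published first */
        atomic_bool tail;               /* "PageTail" flag, published last */
    };

    static struct fake_page head;
    static struct fake_page tail_page;

    static void *writer(void *arg)
    {
        (void)arg;
        tail_page.first_page = &head;
        /* Release store plays the role of smp_wmb() + __SetPageTail(). */
        atomic_store_explicit(&tail_page.tail, true, memory_order_release);
        return NULL;
    }

    static void *reader(void *arg)
    {
        (void)arg;
        /* If the flag is visible, first_page is guaranteed to be valid. */
        if (atomic_load_explicit(&tail_page.tail, memory_order_acquire))
            printf("tail page, first_page=%p\n", (void *)tail_page.first_page);
        else
            printf("not a tail page (yet)\n");
        return NULL;
    }

    int main(void)
    {
        pthread_t w, r;

        pthread_create(&w, NULL, writer, NULL);
        pthread_create(&r, NULL, reader, NULL);
        pthread_join(w, NULL);
        pthread_join(r, NULL);
        return 0;
    }
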
diff --git a/mm/slub.c b/mm/slub.c
index 7e3e0458bce4..25f14ad8f817 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1004,21 +1004,19 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
 static void add_full(struct kmem_cache *s,
         struct kmem_cache_node *n, struct page *page)
 {
-        lockdep_assert_held(&n->list_lock);
-
         if (!(s->flags & SLAB_STORE_USER))
                 return;
 
+        lockdep_assert_held(&n->list_lock);
         list_add(&page->lru, &n->full);
 }
 
 static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
 {
-        lockdep_assert_held(&n->list_lock);
-
         if (!(s->flags & SLAB_STORE_USER))
                 return;
 
+        lockdep_assert_held(&n->list_lock);
         list_del(&page->lru);
 }
 
@@ -1520,11 +1518,9 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
 /*
  * Management of partially allocated slabs.
  */
-static inline void add_partial(struct kmem_cache_node *n,
-                                struct page *page, int tail)
+static inline void
+__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
 {
-        lockdep_assert_held(&n->list_lock);
-
         n->nr_partial++;
         if (tail == DEACTIVATE_TO_TAIL)
                 list_add_tail(&page->lru, &n->partial);
@@ -1532,15 +1528,27 @@ static inline void add_partial(struct kmem_cache_node *n,
                 list_add(&page->lru, &n->partial);
 }
 
-static inline void remove_partial(struct kmem_cache_node *n,
-                                        struct page *page)
+static inline void add_partial(struct kmem_cache_node *n,
+                                struct page *page, int tail)
 {
         lockdep_assert_held(&n->list_lock);
+        __add_partial(n, page, tail);
+}
 
+static inline void
+__remove_partial(struct kmem_cache_node *n, struct page *page)
+{
         list_del(&page->lru);
         n->nr_partial--;
 }
 
+static inline void remove_partial(struct kmem_cache_node *n,
+                                        struct page *page)
+{
+        lockdep_assert_held(&n->list_lock);
+        __remove_partial(n, page);
+}
+
 /*
  * Remove slab from the partial list, freeze it and
  * return the pointer to the freelist.
@@ -2906,12 +2914,10 @@ static void early_kmem_cache_node_alloc(int node)
         inc_slabs_node(kmem_cache_node, node, page->objects);
 
         /*
-         * the lock is for lockdep's sake, not for any actual
-         * race protection
+         * No locks need to be taken here as it has just been
+         * initialized and there is no concurrent access.
          */
-        spin_lock(&n->list_lock);
-        add_partial(n, page, DEACTIVATE_TO_HEAD);
-        spin_unlock(&n->list_lock);
+        __add_partial(n, page, DEACTIVATE_TO_HEAD);
 }
 
 static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -3197,7 +3203,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 
         list_for_each_entry_safe(page, h, &n->partial, lru) {
                 if (!page->inuse) {
-                        remove_partial(n, page);
+                        __remove_partial(n, page);
                         discard_slab(s, page);
                 } else {
                         list_slab_objects(s, page,
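
The slub.c split into add_partial()/__add_partial() (with a matching remove_partial()/__remove_partial() pair) keeps the lockdep assertion on the normal paths while letting init- and teardown-time callers such as early_kmem_cache_node_alloc() and free_partial(), which cannot race, skip the lock entirely. A toy illustration of that locked-wrapper/unlocked-helper layering, with an explicit flag standing in for lockdep; this is not the SLUB code.

    #include <assert.h>
    #include <pthread.h>
    #include <stdio.h>

    /* Toy node: a counter guarded by list_lock once the node is published. */
    struct node {
        pthread_mutex_t list_lock;
        int lock_held;              /* debug flag standing in for lockdep */
        int nr_partial;
    };

    /* Inner helper: no locking assumptions, for callers that cannot race. */
    static void add_partial_nolock(struct node *n)
    {
        n->nr_partial++;
    }

    /* Normal path: the caller must already hold list_lock. */
    static void add_partial(struct node *n)
    {
        assert(n->lock_held);
        add_partial_nolock(n);
    }

    int main(void)
    {
        struct node n = { .list_lock = PTHREAD_MUTEX_INITIALIZER };

        /* Early init: nobody else can see the node, so skip the lock. */
        add_partial_nolock(&n);

        /* Regular use: take the lock, then call the asserting wrapper. */
        pthread_mutex_lock(&n.list_lock);
        n.lock_held = 1;
        add_partial(&n);
        n.lock_held = 0;
        pthread_mutex_unlock(&n.list_lock);

        printf("nr_partial = %d\n", n.nr_partial);
        return 0;
    }
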
diff --git a/mm/swap.c b/mm/swap.c
index b31ba67d440a..0092097b3f4c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -98,7 +98,7 @@ static void put_compound_page(struct page *page)
         }
 
         /* __split_huge_page_refcount can run under us */
-        page_head = compound_trans_head(page);
+        page_head = compound_head(page);
 
         /*
          * THP can not break up slab pages so avoid taking
@@ -253,7 +253,7 @@ bool __get_page_tail(struct page *page)
                  */
                 unsigned long flags;
                 bool got;
-                struct page *page_head = compound_trans_head(page);
+                struct page *page_head = compound_head(page);
 
                 /* Ref to put_compound_page() comment. */
                 if (!__compound_tail_refcounted(page_head)) {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 98e85e9c2b2d..e76ace30d436 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void)
         return ret;
 }
 
+static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
+
 void show_swap_cache_info(void)
 {
         printk("%lu pages in swap cache\n", total_swapcache_pages());
@@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry)
 
         page = find_get_page(swap_address_space(entry), entry.val);
 
-        if (page)
+        if (page) {
                 INC_CACHE_INFO(find_success);
+                if (TestClearPageReadahead(page))
+                        atomic_inc(&swapin_readahead_hits);
+        }
 
         INC_CACHE_INFO(find_total);
         return page;
@@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
         return found_page;
 }
 
+static unsigned long swapin_nr_pages(unsigned long offset)
+{
+        static unsigned long prev_offset;
+        unsigned int pages, max_pages, last_ra;
+        static atomic_t last_readahead_pages;
+
+        max_pages = 1 << ACCESS_ONCE(page_cluster);
+        if (max_pages <= 1)
+                return 1;
+
+        /*
+         * This heuristic has been found to work well on both sequential and
+         * random loads, swapping to hard disk or to SSD: please don't ask
+         * what the "+ 2" means, it just happens to work well, that's all.
+         */
+        pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
+        if (pages == 2) {
+                /*
+                 * We can have no readahead hits to judge by: but must not get
+                 * stuck here forever, so check for an adjacent offset instead
+                 * (and don't even bother to check whether swap type is same).
+                 */
+                if (offset != prev_offset + 1 && offset != prev_offset - 1)
+                        pages = 1;
+                prev_offset = offset;
+        } else {
+                unsigned int roundup = 4;
+                while (roundup < pages)
+                        roundup <<= 1;
+                pages = roundup;
+        }
+
+        if (pages > max_pages)
+                pages = max_pages;
+
+        /* Don't shrink readahead too fast */
+        last_ra = atomic_read(&last_readahead_pages) / 2;
+        if (pages < last_ra)
+                pages = last_ra;
+        atomic_set(&last_readahead_pages, pages);
+
+        return pages;
+}
+
 /**
  * swapin_readahead - swap in pages in hope we need them soon
  * @entry: swap entry of this memory
@@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
                         struct vm_area_struct *vma, unsigned long addr)
 {
         struct page *page;
-        unsigned long offset = swp_offset(entry);
+        unsigned long entry_offset = swp_offset(entry);
+        unsigned long offset = entry_offset;
         unsigned long start_offset, end_offset;
-        unsigned long mask = (1UL << page_cluster) - 1;
+        unsigned long mask;
         struct blk_plug plug;
 
+        mask = swapin_nr_pages(offset) - 1;
+        if (!mask)
+                goto skip;
+
         /* Read a page_cluster sized and aligned cluster around offset. */
         start_offset = offset & ~mask;
         end_offset = offset | mask;
@@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
                                                 gfp_mask, vma, addr);
                 if (!page)
                         continue;
+                if (offset != entry_offset)
+                        SetPageReadahead(page);
                 page_cache_release(page);
         }
         blk_finish_plug(&plug);
 
         lru_add_drain();        /* Push any new pages onto the LRU now */
+skip:
         return read_swap_cache_async(entry, gfp_mask, vma, addr);
 }
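
swapin_nr_pages() above turns the recent readahead hit count into a window: hits + 2, rounded up to a power of two, clamped to 1 << page_cluster, and never allowed to drop below half of the previous window. The sketch below replays just that sizing arithmetic for a range of hit counts; the atomics, the hits == 0 adjacency check, and the static state are left out.

    #include <stdio.h>

    /*
     * Window sizing only: 'hits' is what the kernel reads from
     * swapin_readahead_hits, 'last_ra' is the previous window and
     * 'max_pages' is 1 << page_cluster.
     */
    static unsigned int window_size(unsigned int hits, unsigned int last_ra,
                                    unsigned int max_pages)
    {
        unsigned int pages = hits + 2;

        if (pages > 2) {
            unsigned int roundup = 4;

            while (roundup < pages)
                roundup <<= 1;
            pages = roundup;
        }

        if (pages > max_pages)
            pages = max_pages;

        /* Don't shrink readahead too fast. */
        if (pages < last_ra / 2)
            pages = last_ra / 2;

        return pages;
    }

    int main(void)
    {
        unsigned int hits;

        /* page_cluster = 3 -> at most 8 pages; previous window was 8. */
        for (hits = 0; hits <= 6; hits++)
            printf("hits=%u -> window=%u pages\n", hits,
                   window_size(hits, 8, 8));
        return 0;
    }
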
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c6c13b050a58..4a7f7e6992b6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1923,7 +1923,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
         p->swap_map = NULL;
         cluster_info = p->cluster_info;
         p->cluster_info = NULL;
-        p->flags = 0;
         frontswap_map = frontswap_map_get(p);
         spin_unlock(&p->lock);
         spin_unlock(&swap_lock);
@@ -1949,6 +1948,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                 mutex_unlock(&inode->i_mutex);
         }
         filp_close(swap_file, NULL);
+
+        /*
+         * Clear the SWP_USED flag after all resources are freed so that swapon
+         * can reuse this swap_info in alloc_swap_info() safely. It is ok to
+         * not hold p->lock after we cleared its SWP_WRITEOK.
+         */
+        spin_lock(&swap_lock);
+        p->flags = 0;
+        spin_unlock(&swap_lock);
+
         err = 0;
         atomic_inc(&proc_poll_event);
         wake_up_interruptible(&proc_poll_wait);
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 196970a4541f..d4042e75f7c7 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -19,6 +19,7 @@
 #include <linux/mm.h>
 #include <linux/vmstat.h>
 #include <linux/eventfd.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/printk.h>
 #include <linux/vmpressure.h>
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 72496140ac08..def5dd2fbe61 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -851,12 +851,14 @@ const char * const vmstat_text[] = {
         "thp_zero_page_alloc",
         "thp_zero_page_alloc_failed",
 #endif
+#ifdef CONFIG_DEBUG_TLBFLUSH
 #ifdef CONFIG_SMP
         "nr_tlb_remote_flush",
         "nr_tlb_remote_flush_received",
-#endif
+#endif /* CONFIG_SMP */
         "nr_tlb_local_flush_all",
         "nr_tlb_local_flush_one",
+#endif /* CONFIG_DEBUG_TLBFLUSH */
 
 #endif /* CONFIG_VM_EVENTS_COUNTERS */
 };