Diffstat (limited to 'mm')
-rw-r--r--	mm/Kconfig	2
-rw-r--r--	mm/compaction.c	4
-rw-r--r--	mm/fremap.c	8
-rw-r--r--	mm/huge_memory.c	48
-rw-r--r--	mm/memcontrol.c	2
-rw-r--r--	mm/memory-failure.c	24
-rw-r--r--	mm/memory.c	2
-rw-r--r--	mm/mempolicy.c	16
-rw-r--r--	mm/migrate.c	82
-rw-r--r--	mm/mlock.c	44
-rw-r--r--	mm/mprotect.c	13
-rw-r--r--	mm/page_alloc.c	19
-rw-r--r--	mm/pgtable-generic.c	8
-rw-r--r--	mm/rmap.c	4
14 files changed, 204 insertions(+), 72 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index eb69f352401d..723bbe04a0b0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -543,7 +543,7 @@ config ZSWAP
 
 config MEM_SOFT_DIRTY
 	bool "Track memory changes"
-	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
+	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
 	select PROC_PAGE_MONITOR
 	help
 	  This option enables memory changes tracking by introducing a
diff --git a/mm/compaction.c b/mm/compaction.c
index 805165bcd3dd..f58bcd016f43 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -134,6 +134,10 @@ static void update_pageblock_skip(struct compact_control *cc,
 			bool migrate_scanner)
 {
 	struct zone *zone = cc->zone;
+
+	if (cc->ignore_skip_hint)
+		return;
+
 	if (!page)
 		return;
 
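The hunk above adds an early return so callers that set ignore_skip_hint never dirty the per-pageblock skip hints. A minimal userspace model of that guard (illustrative names, not kernel code):

#include <stdbool.h>

struct compact_model {
	bool ignore_skip_hint;
	bool skip[1024];		/* hypothetical per-pageblock skip bits */
};

static void update_skip_model(struct compact_model *cc, int block)
{
	if (cc->ignore_skip_hint)	/* mirrors the new early return */
		return;
	cc->skip[block] = true;
}

int main(void)
{
	struct compact_model cc = { .ignore_skip_hint = true };
	update_skip_model(&cc, 7);	/* no-op: hint state stays clean */
	return cc.skip[7];		/* exits 0 */
}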
diff --git a/mm/fremap.c b/mm/fremap.c
index 5bff08147768..bbc4d660221a 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -208,9 +208,10 @@ get_write_lock:
 	if (mapping_cap_account_dirty(mapping)) {
 		unsigned long addr;
 		struct file *file = get_file(vma->vm_file);
+		/* mmap_region may free vma; grab the info now */
+		vm_flags = vma->vm_flags;
 
-		addr = mmap_region(file, start, size,
-				vma->vm_flags, pgoff);
+		addr = mmap_region(file, start, size, vm_flags, pgoff);
 		fput(file);
 		if (IS_ERR_VALUE(addr)) {
 			err = addr;
@@ -218,7 +219,7 @@ get_write_lock:
 			BUG_ON(addr != start);
 			err = 0;
 		}
-		goto out;
+		goto out_freed;
 	}
 	mutex_lock(&mapping->i_mmap_mutex);
 	flush_dcache_mmap_lock(mapping);
@@ -253,6 +254,7 @@ get_write_lock:
 out:
 	if (vma)
 		vm_flags = vma->vm_flags;
+out_freed:
 	if (likely(!has_write_lock))
 		up_read(&mm->mmap_sem);
 	else
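The fremap change is a use-after-free repair: mmap_region() may free the vma, so vm_flags is snapshotted before the call and the later dereference is skipped via the new out_freed label. A small userspace sketch of the same snapshot-before-call pattern (hypothetical types):

#include <assert.h>
#include <stdlib.h>

struct vma_model { unsigned long vm_flags; };

static unsigned long remap_model(struct vma_model *vma)
{
	unsigned long vm_flags = vma->vm_flags;	/* snapshot first */

	free(vma);		/* stands in for mmap_region() freeing the vma */
	return vm_flags;	/* safe: no read through the dangling pointer */
}

int main(void)
{
	struct vma_model *vma = malloc(sizeof(*vma));

	vma->vm_flags = 0x73;
	assert(remap_model(vma) == 0x73);
	return 0;
}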
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 33a5dc492810..95d1acb0f3d2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -882,6 +882,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		ret = 0;
 		goto out_unlock;
 	}
+
 	if (unlikely(pmd_trans_splitting(pmd))) {
 		/* split huge page running from under us */
 		spin_unlock(src_ptl);
@@ -1153,7 +1154,7 @@ alloc:
 		new_page = NULL;
 
 	if (unlikely(!new_page)) {
-		if (is_huge_zero_pmd(orig_pmd)) {
+		if (!page) {
 			ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
 					address, pmd, orig_pmd, haddr);
 		} else {
@@ -1180,7 +1181,7 @@ alloc:
 
 	count_vm_event(THP_FAULT_ALLOC);
 
-	if (is_huge_zero_pmd(orig_pmd))
+	if (!page)
 		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
 	else
 		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
@@ -1206,7 +1207,7 @@ alloc:
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
 		update_mmu_cache_pmd(vma, address, pmd);
-		if (is_huge_zero_pmd(orig_pmd)) {
+		if (!page) {
 			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
 			put_huge_zero_page();
 		} else {
@@ -1243,6 +1244,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
 		return ERR_PTR(-EFAULT);
 
+	/* Full NUMA hinting faults to serialise migration in fault paths */
+	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+		goto out;
+
 	page = pmd_page(*pmd);
 	VM_BUG_ON(!PageHead(page));
 	if (flags & FOLL_TOUCH) {
@@ -1295,6 +1300,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(!pmd_same(pmd, *pmdp)))
 		goto out_unlock;
 
+	/*
+	 * If there are potential migrations, wait for completion and retry
+	 * without disrupting NUMA hinting information. Do not relock and
+	 * check_same as the page may no longer be mapped.
+	 */
+	if (unlikely(pmd_trans_migrating(*pmdp))) {
+		spin_unlock(ptl);
+		wait_migrate_huge_page(vma->anon_vma, pmdp);
+		goto out;
+	}
+
 	page = pmd_page(pmd);
 	BUG_ON(is_huge_zero_page(page));
 	page_nid = page_to_nid(page);
@@ -1323,23 +1339,22 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* If the page was locked, there are no parallel migrations */
 		if (page_locked)
 			goto clear_pmdnuma;
+	}
 
-		/*
-		 * Otherwise wait for potential migrations and retry. We do
-		 * relock and check_same as the page may no longer be mapped.
-		 * As the fault is being retried, do not account for it.
-		 */
+	/* Migration could have started since the pmd_trans_migrating check */
+	if (!page_locked) {
 		spin_unlock(ptl);
 		wait_on_page_locked(page);
 		page_nid = -1;
 		goto out;
 	}
 
-	/* Page is misplaced, serialise migrations and parallel THP splits */
+	/*
+	 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
+	 * to serialise splits
+	 */
 	get_page(page);
 	spin_unlock(ptl);
-	if (!page_locked)
-		lock_page(page);
 	anon_vma = page_lock_anon_vma_read(page);
 
 	/* Confirm the PMD did not change while page_table_lock was released */
@@ -1351,6 +1366,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_unlock;
 	}
 
+	/* Bail if we fail to protect against THP splits for any reason */
+	if (unlikely(!anon_vma)) {
+		put_page(page);
+		page_nid = -1;
+		goto clear_pmdnuma;
+	}
+
 	/*
 	 * Migrate the THP to the requested node, returns with page unlocked
 	 * and pmd_numa cleared.
@@ -1517,6 +1539,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		ret = 1;
 		if (!prot_numa) {
 			entry = pmdp_get_and_clear(mm, addr, pmd);
+			if (pmd_numa(entry))
+				entry = pmd_mknonnuma(entry);
 			entry = pmd_modify(entry, newprot);
 			ret = HPAGE_PMD_NR;
 			BUG_ON(pmd_write(entry));
@@ -1531,7 +1555,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			 */
 			if (!is_huge_zero_page(page) &&
 			    !pmd_numa(*pmd)) {
-				entry = pmdp_get_and_clear(mm, addr, pmd);
+				entry = *pmd;
 				entry = pmd_mknuma(entry);
 				ret = HPAGE_PMD_NR;
 			}
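Several of the huge_memory.c hunks implement one pattern: a NUMA hinting fault that observes a THP migration in flight (pmd_trans_migrating(), defined in the migrate.c hunks below) must drop the pmd lock, wait, and retry rather than touch the entry. A hedged pthread sketch of that flow; the kernel re-faults instead of looping here, and waits on the page lock rather than a flag:

#include <pthread.h>
#include <stdbool.h>

struct thp_model {
	pthread_mutex_t ptl;	/* stands in for the pmd lock */
	bool migrating;		/* stands in for pmd_trans_migrating() */
};

/* Returns true once the fault was handled; caller retries on false. */
static bool numa_fault_model(struct thp_model *t)
{
	pthread_mutex_lock(&t->ptl);
	if (t->migrating) {
		pthread_mutex_unlock(&t->ptl);	/* never wait with the lock held */
		/* ... wait_on_page_locked() equivalent would sleep here ... */
		return false;			/* retry the fault from scratch */
	}
	/* ... handle the hinting fault under the lock ... */
	pthread_mutex_unlock(&t->ptl);
	return true;
}

int main(void)
{
	struct thp_model t = { PTHREAD_MUTEX_INITIALIZER, false };

	while (!numa_fault_model(&t))
		;	/* retry loop; would sleep in the kernel */
	return 0;
}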
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bf5e89457149..7f1a356153c0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -338,7 +338,7 @@ struct mem_cgroup {
 static size_t memcg_size(void)
 {
 	return sizeof(struct mem_cgroup) +
-		nr_node_ids * sizeof(struct mem_cgroup_per_node);
+		nr_node_ids * sizeof(struct mem_cgroup_per_node *);
 }
 
 /* internal only representation about the status of kmem accounting. */
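The memcontrol fix corrects a sizing bug: the trailing per-node array holds pointers, so each element must be sized as a pointer, not as the full struct. A compilable illustration of the bug class (made-up struct names):

#include <assert.h>
#include <stdlib.h>

struct node_stats { char counters[256]; };

struct container {
	int nnodes;
	struct node_stats *info[];	/* flexible array of POINTERS */
};

int main(void)
{
	int nnodes = 8;
	/* correct: room for nnodes pointers, not nnodes full structs */
	size_t right = sizeof(struct container) + nnodes * sizeof(struct node_stats *);
	size_t wrong = sizeof(struct container) + nnodes * sizeof(struct node_stats);

	assert(right < wrong);	/* the old formula over-allocated ~2KB here */
	free(malloc(right));
	return 0;
}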
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b7c171602ba1..fabe55046c1d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -938,6 +938,16 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 			BUG_ON(!PageHWPoison(p));
 			return SWAP_FAIL;
 		}
+		/*
+		 * We pinned the head page for hwpoison handling,
+		 * now we split the thp and we are interested in
+		 * the hwpoisoned raw page, so move the refcount
+		 * to it.
+		 */
+		if (hpage != p) {
+			put_page(hpage);
+			get_page(p);
+		}
 		/* THP is split, so ppage should be the real poisoned page. */
 		ppage = p;
 	}
@@ -1505,10 +1515,16 @@ static int soft_offline_huge_page(struct page *page, int flags)
 		if (ret > 0)
 			ret = -EIO;
 	} else {
-		set_page_hwpoison_huge_page(hpage);
-		dequeue_hwpoisoned_huge_page(hpage);
-		atomic_long_add(1 << compound_order(hpage),
-				&num_poisoned_pages);
+		/* overcommit hugetlb page will be freed to buddy */
+		if (PageHuge(page)) {
+			set_page_hwpoison_huge_page(hpage);
+			dequeue_hwpoisoned_huge_page(hpage);
+			atomic_long_add(1 << compound_order(hpage),
+					&num_poisoned_pages);
+		} else {
+			SetPageHWPoison(page);
+			atomic_long_inc(&num_poisoned_pages);
+		}
 	}
 	return ret;
 }
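The first memory-failure hunk moves the pin taken on the compound head over to the raw poisoned subpage once the THP has been split. A tiny userspace model of that refcount hand-off:

#include <assert.h>

struct page_model { int refcount; };

/* move the pin from the (former) head page to the raw poisoned page */
static void move_pin(struct page_model *hpage, struct page_model *p)
{
	if (hpage != p) {
		hpage->refcount--;	/* put_page(hpage) */
		p->refcount++;		/* get_page(p) */
	}
}

int main(void)
{
	struct page_model head = { .refcount = 1 }, raw = { .refcount = 0 };

	move_pin(&head, &raw);
	assert(head.refcount == 0 && raw.refcount == 1);
	return 0;
}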
diff --git a/mm/memory.c b/mm/memory.c
index 5d9025f3b3e1..6768ce9e57d2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4271,7 +4271,7 @@ void copy_user_huge_page(struct page *dst, struct page *src,
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
-#if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS
+#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
 bool ptlock_alloc(struct page *page)
 {
 	spinlock_t *ptl;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eca4a3129129..0cd2c4d4e270 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1197,14 +1197,16 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
 			break;
 		vma = vma->vm_next;
 	}
+
+	if (PageHuge(page)) {
+		if (vma)
+			return alloc_huge_page_noerr(vma, address, 1);
+		else
+			return NULL;
+	}
 	/*
-	 * queue_pages_range() confirms that @page belongs to some vma,
-	 * so vma shouldn't be NULL.
+	 * if !vma, alloc_page_vma() will use task or system default policy
 	 */
-	BUG_ON(!vma);
-
-	if (PageHuge(page))
-		return alloc_huge_page_noerr(vma, address, 1);
 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 }
 #else
1210#else 1212#else
@@ -1318,7 +1320,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
 	} else
-		putback_lru_pages(&pagelist);
+		putback_movable_pages(&pagelist);
 
 	up_write(&mm->mmap_sem);
  mpol_out:
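After the first mempolicy hunk, new_vma_page() tolerates a NULL vma for base pages (alloc_page_vma() falls back to the task or system default policy) while still requiring a vma for hugetlb pages. A sketch under those assumptions, with stand-in allocators rather than the real mempolicy API:

#include <stdbool.h>
#include <stdlib.h>

struct vma_model { int dummy; };

/* hypothetical stand-ins for alloc_huge_page_noerr()/alloc_page_vma() */
static void *alloc_huge(struct vma_model *vma) { (void)vma; return malloc(1); }
static void *alloc_base(struct vma_model *vma) { (void)vma; return malloc(1); }

static void *new_page_model(bool page_is_huge, struct vma_model *vma)
{
	if (page_is_huge)
		return vma ? alloc_huge(vma) : NULL;	/* hugetlb needs a vma */
	/* NULL vma is fine here: falls back to the default policy */
	return alloc_base(vma);
}

int main(void)
{
	void *p = new_page_model(false, NULL);	/* base page, default policy */

	free(p);
	return new_page_model(true, NULL) != NULL;	/* huge + no vma: NULL */
}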
diff --git a/mm/migrate.c b/mm/migrate.c
index bb940045fe85..9194375b2307 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -36,6 +36,7 @@
 #include <linux/hugetlb_cgroup.h>
 #include <linux/gfp.h>
 #include <linux/balloon_compaction.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlbflush.h>
 
@@ -316,14 +317,15 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
  */
 int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page,
-		struct buffer_head *head, enum migrate_mode mode)
+		struct buffer_head *head, enum migrate_mode mode,
+		int extra_count)
 {
-	int expected_count = 0;
+	int expected_count = 1 + extra_count;
 	void **pslot;
 
 	if (!mapping) {
 		/* Anonymous page without mapping */
-		if (page_count(page) != 1)
+		if (page_count(page) != expected_count)
 			return -EAGAIN;
 		return MIGRATEPAGE_SUCCESS;
 	}
@@ -333,7 +335,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
  					page_index(page));
 
-	expected_count = 2 + page_has_private(page);
+	expected_count += 1 + page_has_private(page);
 	if (page_count(page) != expected_count ||
 		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
 		spin_unlock_irq(&mapping->tree_lock);
@@ -583,7 +585,7 @@ int migrate_page(struct address_space *mapping,
 
 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
 
-	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
+	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
 
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
@@ -610,7 +612,7 @@ int buffer_migrate_page(struct address_space *mapping,
 
 	head = page_buffers(page);
 
-	rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
+	rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
 
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
@@ -1654,6 +1656,18 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 	return 1;
 }
 
+bool pmd_trans_migrating(pmd_t pmd)
+{
+	struct page *page = pmd_page(pmd);
+	return PageLocked(page);
+}
+
+void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
+{
+	struct page *page = pmd_page(*pmd);
+	wait_on_page_locked(page);
+}
+
 /*
  * Attempt to migrate a misplaced page to the specified destination
  * node. Caller is expected to have an elevated reference count on
@@ -1716,12 +1730,14 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 			struct page *page, int node)
 {
 	spinlock_t *ptl;
-	unsigned long haddr = address & HPAGE_PMD_MASK;
 	pg_data_t *pgdat = NODE_DATA(node);
 	int isolated = 0;
 	struct page *new_page = NULL;
 	struct mem_cgroup *memcg = NULL;
 	int page_lru = page_is_file_cache(page);
+	unsigned long mmun_start = address & HPAGE_PMD_MASK;
+	unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
+	pmd_t orig_entry;
 
 	/*
 	 * Rate-limit the amount of data that is being migrated to a node.
@@ -1744,6 +1760,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 		goto out_fail;
 	}
 
+	if (mm_tlb_flush_pending(mm))
+		flush_tlb_range(vma, mmun_start, mmun_end);
+
 	/* Prepare a page as a migration target */
 	__set_page_locked(new_page);
 	SetPageSwapBacked(new_page);
@@ -1755,9 +1774,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	WARN_ON(PageLRU(new_page));
 
 	/* Recheck the target PMD */
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	ptl = pmd_lock(mm, pmd);
-	if (unlikely(!pmd_same(*pmd, entry))) {
+	if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
+fail_putback:
 		spin_unlock(ptl);
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 		/* Reverse changes made by migrate_page_copy() */
 		if (TestClearPageActive(new_page))
@@ -1774,7 +1796,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 		putback_lru_page(page);
 		mod_zone_page_state(page_zone(page),
 			 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
-		goto out_fail;
+
+		goto out_unlock;
 	}
 
 	/*
@@ -1786,16 +1809,35 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	 */
 	mem_cgroup_prepare_migration(page, new_page, &memcg);
 
+	orig_entry = *pmd;
 	entry = mk_pmd(new_page, vma->vm_page_prot);
-	entry = pmd_mknonnuma(entry);
-	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 	entry = pmd_mkhuge(entry);
+	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 
-	pmdp_clear_flush(vma, haddr, pmd);
-	set_pmd_at(mm, haddr, pmd, entry);
-	page_add_new_anon_rmap(new_page, vma, haddr);
+	/*
+	 * Clear the old entry under pagetable lock and establish the new PTE.
+	 * Any parallel GUP will either observe the old page blocking on the
+	 * page lock, block on the page table lock or observe the new page.
+	 * The SetPageUptodate on the new page and page_add_new_anon_rmap
+	 * guarantee the copy is visible before the pagetable update.
+	 */
+	flush_cache_range(vma, mmun_start, mmun_end);
+	page_add_new_anon_rmap(new_page, vma, mmun_start);
+	pmdp_clear_flush(vma, mmun_start, pmd);
+	set_pmd_at(mm, mmun_start, pmd, entry);
+	flush_tlb_range(vma, mmun_start, mmun_end);
 	update_mmu_cache_pmd(vma, address, &entry);
+
+	if (page_count(page) != 2) {
+		set_pmd_at(mm, mmun_start, pmd, orig_entry);
+		flush_tlb_range(vma, mmun_start, mmun_end);
+		update_mmu_cache_pmd(vma, address, &entry);
+		page_remove_rmap(new_page);
+		goto fail_putback;
+	}
+
 	page_remove_rmap(page);
+
 	/*
 	 * Finish the charge transaction under the page table lock to
 	 * prevent split_huge_page() from dividing up the charge
@@ -1803,6 +1845,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	 */
 	mem_cgroup_end_migration(memcg, page, new_page, true);
 	spin_unlock(ptl);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 	unlock_page(new_page);
 	unlock_page(page);
@@ -1820,10 +1863,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 out_fail:
 	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 out_dropref:
-	entry = pmd_mknonnuma(entry);
-	set_pmd_at(mm, haddr, pmd, entry);
-	update_mmu_cache_pmd(vma, address, &entry);
+	ptl = pmd_lock(mm, pmd);
+	if (pmd_same(*pmd, entry)) {
+		entry = pmd_mknonnuma(entry);
+		set_pmd_at(mm, mmun_start, pmd, entry);
+		update_mmu_cache_pmd(vma, address, &entry);
+	}
+	spin_unlock(ptl);
 
+out_unlock:
 	unlock_page(page);
 	put_page(page);
 	return 0;
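The expected_count rework threads the caller's extra pins through migrate_page_move_mapping(): the baseline becomes 1 + extra_count, and a mapped page adds the mapping's own reference plus one if buffer_heads are attached. A worked model of that arithmetic (illustrative only):

#include <assert.h>
#include <stdbool.h>

static int expected_refs(bool mapped, bool has_private, int extra_count)
{
	int expected = 1 + extra_count;		/* caller's pin + extra pins */

	if (mapped)				/* page cache holds a ref... */
		expected += 1 + (has_private ? 1 : 0);	/* ...plus private */
	return expected;
}

int main(void)
{
	/* anonymous page, no extra pins: only the caller's reference */
	assert(expected_refs(false, false, 0) == 1);
	/* plain page cache page: caller + mapping */
	assert(expected_refs(true, false, 0) == 2);
	/* page cache page with buffer_heads: caller + mapping + private */
	assert(expected_refs(true, true, 0) == 3);
	return 0;
}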
diff --git a/mm/mlock.c b/mm/mlock.c
index d480cd6fc475..192e6eebe4f2 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -133,7 +133,10 @@ static void __munlock_isolation_failed(struct page *page)
 
 /**
  * munlock_vma_page - munlock a vma page
- * @page - page to be unlocked
+ * @page - page to be unlocked, either a normal page or THP page head
+ *
+ * returns the size of the page as a page mask (0 for normal page,
+ * HPAGE_PMD_NR - 1 for THP head page)
  *
  * called from munlock()/munmap() path with page supposedly on the LRU.
  * When we munlock a page, because the vma where we found the page is being
@@ -148,21 +151,30 @@ static void __munlock_isolation_failed(struct page *page)
  */
 unsigned int munlock_vma_page(struct page *page)
 {
-	unsigned int page_mask = 0;
+	unsigned int nr_pages;
 
 	BUG_ON(!PageLocked(page));
 
 	if (TestClearPageMlocked(page)) {
-		unsigned int nr_pages = hpage_nr_pages(page);
+		nr_pages = hpage_nr_pages(page);
 		mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
-		page_mask = nr_pages - 1;
 		if (!isolate_lru_page(page))
 			__munlock_isolated_page(page);
 		else
 			__munlock_isolation_failed(page);
+	} else {
+		nr_pages = hpage_nr_pages(page);
 	}
 
-	return page_mask;
+	/*
+	 * Regardless of the original PageMlocked flag, we determine nr_pages
+	 * after touching the flag. This leaves a possible race with a THP page
+	 * split, such that a whole THP page was munlocked, but nr_pages == 1.
+	 * Returning a smaller mask due to that is OK, the worst that can
+	 * happen is subsequent useless scanning of the former tail pages.
+	 * The NR_MLOCK accounting can however become broken.
+	 */
+	return nr_pages - 1;
 }
 
 /**
@@ -286,10 +298,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
 {
 	int i;
 	int nr = pagevec_count(pvec);
-	int delta_munlocked = -nr;
+	int delta_munlocked;
 	struct pagevec pvec_putback;
 	int pgrescued = 0;
 
+	pagevec_init(&pvec_putback, 0);
+
 	/* Phase 1: page isolation */
 	spin_lock_irq(&zone->lru_lock);
 	for (i = 0; i < nr; i++) {
@@ -318,18 +332,21 @@ skip_munlock:
 			/*
 			 * We won't be munlocking this page in the next phase
 			 * but we still need to release the follow_page_mask()
-			 * pin.
+			 * pin. We cannot do it under lru_lock however. If it's
+			 * the last pin, __page_cache_release would deadlock.
 			 */
+			pagevec_add(&pvec_putback, pvec->pages[i]);
 			pvec->pages[i] = NULL;
-			put_page(page);
-			delta_munlocked++;
 		}
 	}
+	delta_munlocked = -nr + pagevec_count(&pvec_putback);
 	__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
 	spin_unlock_irq(&zone->lru_lock);
 
+	/* Now we can release pins of pages that we are not munlocking */
+	pagevec_release(&pvec_putback);
+
 	/* Phase 2: page munlock */
-	pagevec_init(&pvec_putback, 0);
 	for (i = 0; i < nr; i++) {
 		struct page *page = pvec->pages[i];
 
@@ -440,7 +457,8 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
 
 	while (start < end) {
 		struct page *page = NULL;
-		unsigned int page_mask, page_increm;
+		unsigned int page_mask;
+		unsigned long page_increm;
 		struct pagevec pvec;
 		struct zone *zone;
 		int zoneid;
@@ -490,7 +508,9 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
 				goto next;
 			}
 		}
-		page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+		/* It's a bug to munlock in the middle of a THP page */
+		VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
+		page_increm = 1 + page_mask;
 		start += page_increm * PAGE_SIZE;
 next:
 		cond_resched();
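munlock_vma_page() now always returns nr_pages - 1, so munlock_vma_pages_range() advances by 1 + page_mask pages per iteration. Worked numbers, assuming x86-64 with 4K base pages:

#include <assert.h>

#define HPAGE_PMD_NR 512	/* assumption: x86-64, 4K base pages */

int main(void)
{
	unsigned int base_mask = 1 - 1;			/* normal page: nr_pages - 1 = 0 */
	unsigned int thp_mask = HPAGE_PMD_NR - 1;	/* THP head: 511 */

	assert(1 + base_mask == 1);		/* advance one page */
	assert(1 + thp_mask == HPAGE_PMD_NR);	/* skip the whole THP */
	return 0;
}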
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 26667971c824..bb53a6591aea 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -52,17 +52,21 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			pte_t ptent;
 			bool updated = false;
 
-			ptent = ptep_modify_prot_start(mm, addr, pte);
 			if (!prot_numa) {
+				ptent = ptep_modify_prot_start(mm, addr, pte);
+				if (pte_numa(ptent))
+					ptent = pte_mknonnuma(ptent);
 				ptent = pte_modify(ptent, newprot);
 				updated = true;
 			} else {
 				struct page *page;
 
+				ptent = *pte;
 				page = vm_normal_page(vma, addr, oldpte);
 				if (page) {
 					if (!pte_numa(oldpte)) {
 						ptent = pte_mknuma(ptent);
+						set_pte_at(mm, addr, pte, ptent);
 						updated = true;
 					}
 				}
@@ -79,7 +83,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 			if (updated)
 				pages++;
-			ptep_modify_prot_commit(mm, addr, pte, ptent);
+
+			/* Only !prot_numa always clears the pte */
+			if (!prot_numa)
+				ptep_modify_prot_commit(mm, addr, pte, ptent);
 		} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
 
@@ -181,6 +188,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
 	BUG_ON(addr >= end);
 	pgd = pgd_offset(mm, addr);
 	flush_cache_range(vma, addr, end);
+	set_tlb_flush_pending(mm);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
@@ -192,6 +200,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
 	/* Only flush the TLB if we actually modified any entries: */
 	if (pages)
 		flush_tlb_range(vma, start, end);
+	clear_tlb_flush_pending(mm);
 
 	return pages;
 }
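The set_tlb_flush_pending()/clear_tlb_flush_pending() bracket lets the THP migration path (the mm_tlb_flush_pending() check added in migrate.c above) flush on behalf of an in-flight protection change before copying the page. A userspace sketch of the handshake using a plain atomic flag; the kernel uses a per-mm flag plus explicit memory barriers:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int tlb_flush_pending;	/* per-mm in the kernel */

static void change_protection_model(void)
{
	atomic_store(&tlb_flush_pending, 1);	/* set_tlb_flush_pending() */
	/* ... modify page table entries ... */
	/* ... flush_tlb_range() once any entries changed ... */
	atomic_store(&tlb_flush_pending, 0);	/* clear_tlb_flush_pending() */
}

/* migration side: flush first if a protection change is in flight */
static bool must_flush_first(void)
{
	return atomic_load(&tlb_flush_pending) != 0;	/* mm_tlb_flush_pending() */
}

int main(void)
{
	change_protection_model();
	return must_flush_first();	/* 0: nothing pending */
}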
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 580a5f075ed0..5248fe070aa4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1816,7 +1816,7 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
 
 static bool zone_local(struct zone *local_zone, struct zone *zone)
 {
-	return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE;
+	return local_zone->node == zone->node;
 }
 
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
@@ -1913,18 +1913,17 @@ zonelist_scan:
 		 * page was allocated in should have no effect on the
 		 * time the page has in memory before being reclaimed.
 		 *
-		 * When zone_reclaim_mode is enabled, try to stay in
-		 * local zones in the fastpath.  If that fails, the
-		 * slowpath is entered, which will do another pass
-		 * starting with the local zones, but ultimately fall
-		 * back to remote zones that do not partake in the
-		 * fairness round-robin cycle of this zonelist.
+		 * Try to stay in local zones in the fastpath.  If
+		 * that fails, the slowpath is entered, which will do
+		 * another pass starting with the local zones, but
+		 * ultimately fall back to remote zones that do not
+		 * partake in the fairness round-robin cycle of this
+		 * zonelist.
 		 */
 		if (alloc_flags & ALLOC_WMARK_LOW) {
 			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
 				continue;
-			if (zone_reclaim_mode &&
-			    !zone_local(preferred_zone, zone))
+			if (!zone_local(preferred_zone, zone))
 				continue;
 		}
 		/*
@@ -2390,7 +2389,7 @@ static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
 		 * thrash fairness information for zones that are not
 		 * actually part of this zonelist's round-robin cycle.
 		 */
-		if (zone_reclaim_mode && !zone_local(preferred_zone, zone))
+		if (!zone_local(preferred_zone, zone))
 			continue;
 		mod_zone_page_state(zone, NR_ALLOC_BATCH,
 			high_wmark_pages(zone) -
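zone_local() now compares node ids directly instead of SLIT distances, so two zones are "local" to each other exactly when they live on the same node. A compilable model of the simplified check (illustrative struct, not the kernel's):

#include <assert.h>
#include <stdbool.h>

struct zone_model { int node; };

static bool zone_local_model(struct zone_model *a, struct zone_model *b)
{
	return a->node == b->node;	/* node identity, not SLIT distance */
}

int main(void)
{
	struct zone_model dma = { 0 }, normal = { 0 }, remote = { 1 };

	assert(zone_local_model(&normal, &dma));	/* same node: local */
	assert(!zone_local_model(&normal, &remote));	/* different node */
	return 0;
}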
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index cbb38545d9d6..a8b919925934 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -110,9 +110,10 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
 pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
 		       pte_t *ptep)
 {
+	struct mm_struct *mm = (vma)->vm_mm;
 	pte_t pte;
-	pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
-	if (pte_accessible(pte))
+	pte = ptep_get_and_clear(mm, address, ptep);
+	if (pte_accessible(mm, pte))
 		flush_tlb_page(vma, address);
 	return pte;
 }
@@ -191,6 +192,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 		     pmd_t *pmdp)
 {
-	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
+	pmd_t entry = *pmdp;
+	if (pmd_numa(entry))
+		entry = pmd_mknonnuma(entry);
+	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
 	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 }
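pmdp_invalidate() must clear the NUMA bit before clearing the present bit: on x86 a NUMA hinting pmd is exactly "present clear, NUMA set", so a naive mknotpresent leaves an entry indistinguishable from one queued for a hinting fault. A bit-level model with made-up flag values (not the real x86 layout):

#include <assert.h>

#define F_PRESENT 0x1u
#define F_NUMA    0x2u		/* illustrative bit values only */

int main(void)
{
	unsigned int numa_pmd = F_NUMA;		/* PRESENT=0, NUMA=1 */

	/* naive mknotpresent: still looks like a NUMA hinting entry */
	unsigned int naive = numa_pmd & ~F_PRESENT;
	assert(naive == numa_pmd);

	/* fixed order: mknonnuma first, then mknotpresent */
	unsigned int fixed = (numa_pmd & ~F_NUMA) & ~F_PRESENT;
	assert(fixed != numa_pmd);	/* unambiguously invalidated now */
	return 0;
}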
diff --git a/mm/rmap.c b/mm/rmap.c
index 55c8b8dc9ffb..068522d8502a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -600,7 +600,11 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
 	spinlock_t *ptl;
 
 	if (unlikely(PageHuge(page))) {
+		/* when pud is not present, pte will be NULL */
 		pte = huge_pte_offset(mm, address);
+		if (!pte)
+			return NULL;
+
 		ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
 		goto check;
 	}