path: root/mm/huge_memory.c
author	Kirill A. Shutemov <kirill.shutemov@linux.intel.com>	2016-01-15 19:54:10 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-01-15 20:56:32 -0500
commit	e9b61f19858a5d6c42ce2298cf138279375d0d9b (patch)
tree	ba2f5851d193c0ab96af67a9df9856b1dd9480ad /mm/huge_memory.c
parent	4e41a30c6d506c884d3da9aeb316352e70679d4b (diff)
thp: reintroduce split_huge_page()
This patch adds an implementation of split_huge_page() for the new refcounting.

Unlike the previous implementation, the new split_huge_page() can fail if somebody holds a GUP pin on the page. It also means that a pin on a page prevents it from being split under you. It makes the situation in many places much cleaner.

The basic scheme of split_huge_page():

  - Check that page_count() equals the sum of the mapcounts of all subpages plus one (the caller's pin). Fail with -EBUSY otherwise. This way we can avoid useless PMD splits.

  - Freeze the page counters by splitting all PMDs and setting up migration PTEs.

  - Re-check the sum of mapcounts against page_count(). The page's counts are stable now. -EBUSY if the page is pinned.

  - Split the compound page.

  - Unfreeze the page by removing migration entries.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Jerome Marchand <jmarchan@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Steve Capper <steve.capper@linaro.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
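As a quick illustration of the accounting behind the first and third steps (editor's sketch, not part of the patch): an anonymous THP mapped by a single PMD and pinned only by the splitting caller has total_mapcount(head) == 1 and page_count(head) == 2, so the expected relation holds; any extra GUP pin raises page_count() without touching the mapcounts and the check fails. The real check lives in split_huge_page_to_list() in the diff below:

	/* sketch only: the racy pre-check described above */
	if (total_mapcount(head) != page_count(head) - 1)
		return -EBUSY;	/* somebody else holds a pin */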
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--	mm/huge_memory.c	374
1 file changed, 371 insertions, 3 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 913559388fda..b6ac6c43d6a4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -16,6 +16,7 @@
 #include <linux/swap.h>
 #include <linux/shrinker.h>
 #include <linux/mm_inline.h>
+#include <linux/swapops.h>
 #include <linux/dax.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
@@ -2726,9 +2727,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	write = pmd_write(*pmd);
 	young = pmd_young(*pmd);
 
-	/* leave pmd empty until pte is filled */
-	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
-
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
@@ -2778,7 +2776,36 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	}
 
 	smp_wmb(); /* make pte visible before pmd */
+	/*
+	 * Up to this point the pmd is present and huge and userland has the
+	 * whole access to the hugepage during the split (which happens in
+	 * place). If we overwrite the pmd with the not-huge version pointing
+	 * to the pte here (which of course we could if all CPUs were bug
+	 * free), userland could trigger a small page size TLB miss on the
+	 * small sized TLB while the hugepage TLB entry is still established
+	 * in the huge TLB. Some CPUs don't like that.
+	 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
+	 * 383 on page 93. Intel should be safe, but it also warns that it's
+	 * only safe if the permission and cache attributes of the two entries
+	 * loaded in the two TLBs are identical (which should be the case
+	 * here). But it is generally safer to never allow small and huge TLB
+	 * entries for the same virtual address to be loaded simultaneously.
+	 * So instead of doing "pmd_populate(); flush_pmd_tlb_range();" we
+	 * first mark the current pmd notpresent (atomically because here the
+	 * pmd_trans_huge and pmd_trans_splitting must remain set at all times
+	 * on the pmd until the split is complete for this pmd), then we flush
+	 * the SMP TLB and finally we write the non-huge version of the pmd
+	 * entry with pmd_populate.
+	 */
+	pmdp_invalidate(vma, haddr, pmd);
 	pmd_populate(mm, pmd, pgtable);
+
+	if (freeze) {
+		for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+			page_remove_rmap(page + i, false);
+			put_page(page + i);
+		}
+	}
 }
 
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
@@ -2863,3 +2890,344 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
 		split_huge_pmd_address(next, nstart);
 	}
 }
+
+static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
+		unsigned long address)
+{
+	spinlock_t *ptl;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	int i, nr = HPAGE_PMD_NR;
+
+	/* Skip pages which don't belong to the VMA */
+	if (address < vma->vm_start) {
+		int off = (vma->vm_start - address) >> PAGE_SHIFT;
+		page += off;
+		nr -= off;
+		address = vma->vm_start;
+	}
+
+	pgd = pgd_offset(vma->vm_mm, address);
+	if (!pgd_present(*pgd))
+		return;
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		return;
+	pmd = pmd_offset(pud, address);
+	ptl = pmd_lock(vma->vm_mm, pmd);
+	if (!pmd_present(*pmd)) {
+		spin_unlock(ptl);
+		return;
+	}
+	if (pmd_trans_huge(*pmd)) {
+		if (page == pmd_page(*pmd))
+			__split_huge_pmd_locked(vma, pmd, address, true);
+		spin_unlock(ptl);
+		return;
+	}
+	spin_unlock(ptl);
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+	for (i = 0; i < nr; i++, address += PAGE_SIZE, page++) {
+		pte_t entry, swp_pte;
+		swp_entry_t swp_entry;
+
+		if (!pte_present(pte[i]))
+			continue;
+		if (page_to_pfn(page) != pte_pfn(pte[i]))
+			continue;
+		flush_cache_page(vma, address, page_to_pfn(page));
+		entry = ptep_clear_flush(vma, address, pte + i);
+		swp_entry = make_migration_entry(page, pte_write(entry));
+		swp_pte = swp_entry_to_pte(swp_entry);
+		if (pte_soft_dirty(entry))
+			swp_pte = pte_swp_mksoft_dirty(swp_pte);
+		set_pte_at(vma->vm_mm, address, pte + i, swp_pte);
+		page_remove_rmap(page, false);
+		put_page(page);
+	}
+	pte_unmap_unlock(pte, ptl);
+}
+
+static void freeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+	struct anon_vma_chain *avc;
+	pgoff_t pgoff = page_to_pgoff(page);
+
+	VM_BUG_ON_PAGE(!PageHead(page), page);
+
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
+			pgoff + HPAGE_PMD_NR - 1) {
+		unsigned long haddr;
+
+		haddr = __vma_address(page, avc->vma) & HPAGE_PMD_MASK;
+		mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+				haddr, haddr + HPAGE_PMD_SIZE);
+		freeze_page_vma(avc->vma, page, haddr);
+		mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+				haddr, haddr + HPAGE_PMD_SIZE);
+	}
+}
+
+static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
+		unsigned long address)
+{
+	spinlock_t *ptl;
+	pmd_t *pmd;
+	pte_t *pte, entry;
+	swp_entry_t swp_entry;
+	int i, nr = HPAGE_PMD_NR;
+
+	/* Skip pages which don't belong to the VMA */
+	if (address < vma->vm_start) {
+		int off = (vma->vm_start - address) >> PAGE_SHIFT;
+		page += off;
+		nr -= off;
+		address = vma->vm_start;
+	}
+
+	pmd = mm_find_pmd(vma->vm_mm, address);
+	if (!pmd)
+		return;
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+	for (i = 0; i < nr; i++, address += PAGE_SIZE, page++) {
+		if (!is_swap_pte(pte[i]))
+			continue;
+
+		swp_entry = pte_to_swp_entry(pte[i]);
+		if (!is_migration_entry(swp_entry))
+			continue;
+		if (migration_entry_to_page(swp_entry) != page)
+			continue;
+
+		get_page(page);
+		page_add_anon_rmap(page, vma, address, false);
+
+		entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
+		entry = pte_mkdirty(entry);
+		if (is_write_migration_entry(swp_entry))
+			entry = maybe_mkwrite(entry, vma);
+
+		flush_dcache_page(page);
+		set_pte_at(vma->vm_mm, address, pte + i, entry);
+
+		/* No need to invalidate - it was non-present before */
+		update_mmu_cache(vma, address, pte + i);
+	}
+	pte_unmap_unlock(pte, ptl);
+}
+
+static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+	struct anon_vma_chain *avc;
+	pgoff_t pgoff = page_to_pgoff(page);
+
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+			pgoff, pgoff + HPAGE_PMD_NR - 1) {
+		unsigned long address = __vma_address(page, avc->vma);
+
+		mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+				address, address + HPAGE_PMD_SIZE);
+		unfreeze_page_vma(avc->vma, page, address);
+		mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+				address, address + HPAGE_PMD_SIZE);
+	}
+}
+
+static int total_mapcount(struct page *page)
+{
+	int i, ret;
+
+	ret = compound_mapcount(page);
+	for (i = 0; i < HPAGE_PMD_NR; i++)
+		ret += atomic_read(&page[i]._mapcount) + 1;
+
+	if (PageDoubleMap(page))
+		ret -= HPAGE_PMD_NR;
+
+	return ret;
+}
+
+static int __split_huge_page_tail(struct page *head, int tail,
+		struct lruvec *lruvec, struct list_head *list)
+{
+	int mapcount;
+	struct page *page_tail = head + tail;
+
+	mapcount = atomic_read(&page_tail->_mapcount) + 1;
+	VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+
+	/*
+	 * tail_page->_count is zero and not changing from under us. But
+	 * get_page_unless_zero() may be running from under us on the
+	 * tail_page. If we used atomic_set() below instead of atomic_add(),
+	 * we would then run atomic_set() concurrently with
+	 * get_page_unless_zero(), and atomic_set() is implemented in C not
+	 * using locked ops. spin_unlock on x86 sometimes uses locked ops
+	 * because of PPro errata 66, 92, so unless somebody can guarantee
+	 * atomic_set() here would be safe on all archs (and not only on x86),
+	 * it's safer to use atomic_add().
+	 */
+	atomic_add(mapcount + 1, &page_tail->_count);
+
+	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+	page_tail->flags |= (head->flags &
+			((1L << PG_referenced) |
+			 (1L << PG_swapbacked) |
+			 (1L << PG_mlocked) |
+			 (1L << PG_uptodate) |
+			 (1L << PG_active) |
+			 (1L << PG_locked) |
+			 (1L << PG_unevictable)));
+	page_tail->flags |= (1L << PG_dirty);
+
+	/*
+	 * After clearing PageTail the gup refcount can be released.
+	 * Page flags must also be visible before we make the page
+	 * non-compound.
+	 */
+	smp_wmb();
+
+	clear_compound_head(page_tail);
+
+	if (page_is_young(head))
+		set_page_young(page_tail);
+	if (page_is_idle(head))
+		set_page_idle(page_tail);
+
+	/* ->mapping in first tail page is compound_mapcount */
+	VM_BUG_ON_PAGE(tail != 1 && page_tail->mapping != TAIL_MAPPING,
+			page_tail);
+	page_tail->mapping = head->mapping;
+
+	page_tail->index = head->index + tail;
+	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+	lru_add_page_tail(head, page_tail, lruvec, list);
+
+	return mapcount;
+}
+
+static void __split_huge_page(struct page *page, struct list_head *list)
+{
+	struct page *head = compound_head(page);
+	struct zone *zone = page_zone(head);
+	struct lruvec *lruvec;
+	int i, tail_mapcount;
+
+	/* prevent PageLRU from going away from under us, and freeze lru stats */
+	spin_lock_irq(&zone->lru_lock);
+	lruvec = mem_cgroup_page_lruvec(head, zone);
+
+	/* complete memcg work before adding pages to the LRU */
+	mem_cgroup_split_huge_fixup(head);
+
+	tail_mapcount = 0;
+	for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
+		tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
+	atomic_sub(tail_mapcount, &head->_count);
+
+	ClearPageCompound(head);
+	spin_unlock_irq(&zone->lru_lock);
+
+	unfreeze_page(page_anon_vma(head), head);
+
+	for (i = 0; i < HPAGE_PMD_NR; i++) {
+		struct page *subpage = head + i;
+		if (subpage == page)
+			continue;
+		unlock_page(subpage);
+
+		/*
+		 * Subpages may be freed if there wasn't any mapping,
+		 * e.g. if add_to_swap() is running on an LRU page that
+		 * had its mapping zapped. And freeing these pages
+		 * requires taking the lru_lock, so we do the put_page
+		 * of the tail pages after the split is complete.
+		 */
+		put_page(subpage);
+	}
+}
+
+/*
+ * This function splits a huge page into normal pages. @page can point to
+ * any subpage of the huge page to split; the split doesn't move @page.
+ *
+ * The caller must hold the only pin on @page, otherwise split fails with
+ * -EBUSY. The huge page must be locked.
+ *
+ * If @list is null, tail pages will be added to the LRU list, otherwise
+ * to @list.
+ *
+ * Both head page and tail pages will inherit mapping, flags, and so on from
+ * the hugepage.
+ *
+ * The GUP pin and PG_locked are transferred to @page. The other subpages can
+ * be freed if they are not mapped.
+ *
+ * Returns 0 if the hugepage was split successfully.
+ * Returns -EBUSY if the page is pinned or if the anon_vma disappeared from
+ * under us.
+ */
+int split_huge_page_to_list(struct page *page, struct list_head *list)
+{
+	struct page *head = compound_head(page);
+	struct anon_vma *anon_vma;
+	int count, mapcount, ret;
+
+	VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
+	VM_BUG_ON_PAGE(!PageAnon(page), page);
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+	VM_BUG_ON_PAGE(!PageCompound(page), page);
+
+	/*
+	 * The caller does not necessarily hold an mmap_sem that would prevent
+	 * the anon_vma disappearing, so we first take a reference to it and
+	 * then lock the anon_vma for write. This is similar to
+	 * page_lock_anon_vma_read() except the write lock is taken to
+	 * serialise against parallel split or collapse operations.
+	 */
+	anon_vma = page_get_anon_vma(head);
+	if (!anon_vma) {
+		ret = -EBUSY;
+		goto out;
+	}
+	anon_vma_lock_write(anon_vma);
+
+	/*
+	 * Racy check whether we can split the page, before freeze_page()
+	 * splits the PMDs.
+	 */
+	if (total_mapcount(head) != page_count(head) - 1) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	freeze_page(anon_vma, head);
+	VM_BUG_ON_PAGE(compound_mapcount(head), head);
+
+	count = page_count(head);
+	mapcount = total_mapcount(head);
+	if (mapcount == count - 1) {
+		__split_huge_page(page, list);
+		ret = 0;
+	} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+		pr_alert("total_mapcount: %u, page_count(): %u\n",
+				mapcount, count);
+		if (PageTail(page))
+			dump_page(head, NULL);
+		dump_page(page, "total_mapcount(head) > page_count(head) - 1");
+		BUG();
+	} else {
+		unfreeze_page(anon_vma, head);
+		ret = -EBUSY;
+	}
+
+out_unlock:
+	anon_vma_unlock_write(anon_vma);
+	put_anon_vma(anon_vma);
+out:
+	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+	return ret;
+}
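
A minimal caller-side sketch of the calling convention documented above (editor's illustration, not code from this patch; the helper name try_split_one_page() is hypothetical). It assumes the caller already holds exactly one pin on @page, for example from get_user_pages(); that pin is the "+1" the split code expects, and any additional pin makes the split return -EBUSY. It uses the long-standing split_huge_page() wrapper around split_huge_page_to_list(page, NULL) and would need <linux/mm.h>, <linux/huge_mm.h> and <linux/pagemap.h>:

	/* Hypothetical caller sketch: split an anonymous THP we hold one pin on. */
	static int try_split_one_page(struct page *page)
	{
		int ret;

		if (!PageCompound(page))
			return 0;		/* nothing to split */

		lock_page(page);		/* the huge page must be locked */
		ret = split_huge_page(page);	/* 0 on success, -EBUSY if pinned elsewhere */
		unlock_page(page);

		return ret;
	}

On success the caller's pin and the page lock stay with @page, which is now a small page; the remaining subpages are unlocked by the split code and freed if they are unmapped.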