path: root/mm/migrate.c
authorJérôme Glisse <jglisse@redhat.com>2017-09-08 19:12:09 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2017-09-08 21:26:46 -0400
commit8763cb45ab967a92a5ee49e9c544c0f0ea90e2d6 (patch)
treea2b5041d068fd69ee8a60c6c3ec8adb004ad0ced /mm/migrate.c
parent2916ecc0f9d435d849c98f4da50e453124c87531 (diff)
mm/migrate: new memory migration helper for use with device memory
This patch adds a new memory migration helper which migrates the memory backing a range of virtual addresses of a process to different memory (which can be allocated through a special allocator). It differs from NUMA migration by working on a range of virtual addresses, and therefore by doing the migration in chunks that can be large enough to use a DMA engine or a special copy-offloading engine.

Expected users are anyone with heterogeneous memory where the different memories have different characteristics (latency, bandwidth, ...). As an example, IBM platforms with a CAPI bus can use this feature to migrate between regular memory and CAPI device memory. New CPU architectures with a pool of high-performance memory that is not managed as a cache but presented as regular memory (while being faster and lower latency than DDR) will also be prime users of this patch. Migration to private device memory will be useful for devices that have a large pool of such memory, like GPUs; NVidia plans to use HMM for that.

Link: http://lkml.kernel.org/r/20170817000548.32038-15-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Evgeny Baskakov <ebaskakov@nvidia.com>
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
Signed-off-by: Sherry Cheung <SCheung@nvidia.com>
Signed-off-by: Subhash Gutti <sgutti@nvidia.com>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Nellans <dnellans@nvidia.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Bob Liu <liubo95@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/migrate.c')
-rw-r--r--	mm/migrate.c	492
1 file changed, 492 insertions, 0 deletions
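The helper introduced below is driven by a pair of driver callbacks. As a minimal, hypothetical usage sketch (not part of this patch): struct migrate_vma_ops and the migrate_vma() prototype are the ones this series adds in include/linux/migrate.h, the callback prototypes are inferred from how they are invoked in the code below, and my_alloc_and_copy(), my_finalize_and_map(), my_migrate_ops and my_migrate_range() are placeholder names.

/* Allocate destination memory and copy each candidate page, e.g. via DMA. */
static void my_alloc_and_copy(struct vm_area_struct *vma,
			      const unsigned long *src, unsigned long *dst,
			      unsigned long start, unsigned long end,
			      void *private)
{
	/* Fill dst[] for every src[] entry still flagged MIGRATE_PFN_MIGRATE. */
}

/* Update device page tables once the CPU side has committed the migration. */
static void my_finalize_and_map(struct vm_area_struct *vma,
				const unsigned long *src,
				const unsigned long *dst,
				unsigned long start, unsigned long end,
				void *private)
{
}

static const struct migrate_vma_ops my_migrate_ops = {
	.alloc_and_copy		= my_alloc_and_copy,
	.finalize_and_map	= my_finalize_and_map,
};

/* src and dst must each hold (end - start) >> PAGE_SHIFT entries. */
static int my_migrate_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end, unsigned long *src,
			    unsigned long *dst, void *private)
{
	return migrate_vma(&my_migrate_ops, vma, start, end, src, dst, private);
}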
diff --git a/mm/migrate.c b/mm/migrate.c
index 71de36cfb673..991e8886093f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -428,6 +428,14 @@ int migrate_page_move_mapping(struct address_space *mapping,
428 int expected_count = 1 + extra_count;
429 void **pslot;
430
431 /*
432 * ZONE_DEVICE pages have 1 refcount always held by their device
433 *
434 * Note that DAX memory will never reach that point as it does not have
435 * the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h).
436 */
437 expected_count += is_zone_device_page(page);
438
439 if (!mapping) {
440 /* Anonymous page without mapping */
441 if (page_count(page) != expected_count)
@@ -2106,3 +2114,487 @@ out_unlock:
2114 #endif /* CONFIG_NUMA_BALANCING */
2115
2116 #endif /* CONFIG_NUMA */
2117
2118
2119struct migrate_vma {
2120 struct vm_area_struct *vma;
2121 unsigned long *dst;
2122 unsigned long *src;
2123 unsigned long cpages;
2124 unsigned long npages;
2125 unsigned long start;
2126 unsigned long end;
2127};
2128
2129static int migrate_vma_collect_hole(unsigned long start,
2130 unsigned long end,
2131 struct mm_walk *walk)
2132{
2133 struct migrate_vma *migrate = walk->private;
2134 unsigned long addr;
2135
2136 for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
2137 migrate->dst[migrate->npages] = 0;
2138 migrate->src[migrate->npages++] = 0;
2139 }
2140
2141 return 0;
2142}
2143
2144static int migrate_vma_collect_pmd(pmd_t *pmdp,
2145 unsigned long start,
2146 unsigned long end,
2147 struct mm_walk *walk)
2148{
2149 struct migrate_vma *migrate = walk->private;
2150 struct vm_area_struct *vma = walk->vma;
2151 struct mm_struct *mm = vma->vm_mm;
2152 unsigned long addr = start;
2153 spinlock_t *ptl;
2154 pte_t *ptep;
2155
2156again:
2157 if (pmd_none(*pmdp))
2158 return migrate_vma_collect_hole(start, end, walk);
2159
2160 if (pmd_trans_huge(*pmdp)) {
2161 struct page *page;
2162
2163 ptl = pmd_lock(mm, pmdp);
2164 if (unlikely(!pmd_trans_huge(*pmdp))) {
2165 spin_unlock(ptl);
2166 goto again;
2167 }
2168
2169 page = pmd_page(*pmdp);
2170 if (is_huge_zero_page(page)) {
2171 spin_unlock(ptl);
2172 split_huge_pmd(vma, pmdp, addr);
2173 if (pmd_trans_unstable(pmdp))
2174 return migrate_vma_collect_hole(start, end,
2175 walk);
2176 } else {
2177 int ret;
2178
2179 get_page(page);
2180 spin_unlock(ptl);
2181 if (unlikely(!trylock_page(page)))
2182 return migrate_vma_collect_hole(start, end,
2183 walk);
2184 ret = split_huge_page(page);
2185 unlock_page(page);
2186 put_page(page);
2187 if (ret || pmd_none(*pmdp))
2188 return migrate_vma_collect_hole(start, end,
2189 walk);
2190 }
2191 }
2192
2193 if (unlikely(pmd_bad(*pmdp)))
2194 return migrate_vma_collect_hole(start, end, walk);
2195
2196 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
2197 for (; addr < end; addr += PAGE_SIZE, ptep++) {
2198 unsigned long mpfn, pfn;
2199 struct page *page;
2200 pte_t pte;
2201
2202 pte = *ptep;
2203 pfn = pte_pfn(pte);
2204
2205 if (!pte_present(pte)) {
2206 mpfn = pfn = 0;
2207 goto next;
2208 }
2209
2210 /* FIXME support THP */
2211 page = vm_normal_page(migrate->vma, addr, pte);
2212 if (!page || !page->mapping || PageTransCompound(page)) {
2213 mpfn = pfn = 0;
2214 goto next;
2215 }
2216
2217 /*
2218 * By getting a reference on the page we pin it and that blocks
2219 * any kind of migration. A side effect is that it "freezes" the
2220 * pte.
2221 *
2222 * We drop this reference after isolating the page from the lru,
2223 * for non-device pages (device pages are not on the lru and thus
2224 * cannot be dropped from it).
2225 */
2226 get_page(page);
2227 migrate->cpages++;
2228 mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
2229 mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
2230
2231next:
2232 migrate->src[migrate->npages++] = mpfn;
2233 }
2234 pte_unmap_unlock(ptep - 1, ptl);
2235
2236 return 0;
2237}
2238
2239/*
2240 * migrate_vma_collect() - collect pages over a range of virtual addresses
2241 * @migrate: migrate struct containing all migration information
2242 *
2243 * This will walk the CPU page table. For each virtual address backed by a
2244 * valid page, it updates the src array and takes a reference on the page, in
2245 * order to pin the page until we lock it and unmap it.
2246 */
2247static void migrate_vma_collect(struct migrate_vma *migrate)
2248{
2249 struct mm_walk mm_walk;
2250
2251 mm_walk.pmd_entry = migrate_vma_collect_pmd;
2252 mm_walk.pte_entry = NULL;
2253 mm_walk.pte_hole = migrate_vma_collect_hole;
2254 mm_walk.hugetlb_entry = NULL;
2255 mm_walk.test_walk = NULL;
2256 mm_walk.vma = migrate->vma;
2257 mm_walk.mm = migrate->vma->vm_mm;
2258 mm_walk.private = migrate;
2259
2260 walk_page_range(migrate->start, migrate->end, &mm_walk);
2261
2262 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
2263}
2264
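After migrate_vma_collect() runs, each src[] slot is either 0 (a hole, or an address that cannot be migrated) or a pfn encoded together with status bits. A small, hypothetical helper to show how such an entry can be interpreted; inspect_src_entry() is a placeholder name, while migrate_pfn_to_page() and the MIGRATE_PFN_* flags are the helpers this series adds in include/linux/migrate.h:

static void inspect_src_entry(unsigned long entry)
{
	struct page *page = migrate_pfn_to_page(entry);

	if (!page)
		return;		/* hole, or address that cannot be migrated */

	if (entry & MIGRATE_PFN_MIGRATE)
		pr_debug("page %p is a migration candidate%s\n", page,
			 (entry & MIGRATE_PFN_WRITE) ? " (writable)" : "");
}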
2265/*
2266 * migrate_vma_check_page() - check if page is pinned or not
2267 * @page: struct page to check
2268 *
2269 * Pinned pages cannot be migrated. This is the same test as in
2270 * migrate_page_move_mapping(), except that here we allow migration of a
2271 * ZONE_DEVICE page.
2272 */
2273static bool migrate_vma_check_page(struct page *page)
2274{
2275 /*
2276 * One extra ref because caller holds an extra reference, either from
2277 * isolate_lru_page() for a regular page, or migrate_vma_collect() for
2278 * a device page.
2279 */
2280 int extra = 1;
2281
2282 /*
2283 * FIXME support THP (transparent huge page), it is bit more complex to
2284 * check them than regular pages, because they can be mapped with a pmd
2285 * or with a pte (split pte mapping).
2286 */
2287 if (PageCompound(page))
2288 return false;
2289
2290 if ((page_count(page) - extra) > page_mapcount(page))
2291 return false;
2292
2293 return true;
2294}
2295
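To make the arithmetic above concrete, a worked example (illustrative only, assuming an anonymous page that is not in the swap cache and whose only references are its mappings plus the one held by the migration code):

/*
 * - mapped by a single pte:		page_mapcount(page) == 1
 * - that mapping plus the migration
 *   code's reference:			page_count(page)    == 2
 *
 * page_count - extra == 2 - 1 == 1 == page_mapcount, so the page is not
 * considered pinned and may migrate.  Any additional reference, e.g. one
 * taken by get_user_pages(), raises page_count without raising
 * page_mapcount, the inequality triggers, and the page is skipped.
 */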
2296/*
2297 * migrate_vma_prepare() - lock pages and isolate them from the lru
2298 * @migrate: migrate struct containing all migration information
2299 *
2300 * This locks pages that have been collected by migrate_vma_collect(). Once each
2301 * page is locked it is isolated from the lru (for non-device pages). Finally,
2302 * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
2303 * migrated by concurrent kernel threads.
2304 */
2305static void migrate_vma_prepare(struct migrate_vma *migrate)
2306{
2307 const unsigned long npages = migrate->npages;
2308 bool allow_drain = true;
2309 unsigned long i;
2310
2311 lru_add_drain();
2312
2313 for (i = 0; (i < npages) && migrate->cpages; i++) {
2314 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2315
2316 if (!page)
2317 continue;
2318
2319 /*
2320 * Because we are migrating several pages there can be
2321 * a deadlock between 2 concurrent migrations where each
2322 * is waiting on the other's page lock.
2323 *
2324 * Make migrate_vma() a best-effort thing and back off
2325 * for any page we cannot lock right away.
2326 */
2327 if (!trylock_page(page)) {
2328 migrate->src[i] = 0;
2329 migrate->cpages--;
2330 put_page(page);
2331 continue;
2332 }
2333 migrate->src[i] |= MIGRATE_PFN_LOCKED;
2334
2335 if (!PageLRU(page) && allow_drain) {
2336 /* Drain CPU's pagevec */
2337 lru_add_drain_all();
2338 allow_drain = false;
2339 }
2340
2341 if (isolate_lru_page(page)) {
2342 migrate->src[i] = 0;
2343 unlock_page(page);
2344 migrate->cpages--;
2345 put_page(page);
2346 continue;
2347 }
2348
2349 if (!migrate_vma_check_page(page)) {
2350 migrate->src[i] = 0;
2351 unlock_page(page);
2352 migrate->cpages--;
2353
2354 putback_lru_page(page);
2355 }
2356 }
2357}
2358
2359/*
2360 * migrate_vma_unmap() - replace page mapping with special migration pte entry
2361 * @migrate: migrate struct containing all migration information
2362 *
2363 * Replace page mapping (CPU page table pte) with a special migration pte entry
2364 * and check again if it has been pinned. Pinned pages are restored because we
2365 * cannot migrate them.
2366 *
2367 * This is the last step before we call the device driver callback to allocate
2368 * destination memory and copy contents of original page over to new page.
2369 */
2370static void migrate_vma_unmap(struct migrate_vma *migrate)
2371{
2372 int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
2373 const unsigned long npages = migrate->npages;
2374 const unsigned long start = migrate->start;
2375 unsigned long addr, i, restore = 0;
2376
2377 for (i = 0; i < npages; i++) {
2378 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2379
2380 if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
2381 continue;
2382
2383 try_to_unmap(page, flags);
2384 if (page_mapped(page) || !migrate_vma_check_page(page)) {
2385 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2386 migrate->cpages--;
2387 restore++;
2388 }
2389 }
2390
2391 for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
2392 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2393
2394 if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
2395 continue;
2396
2397 remove_migration_ptes(page, page, false);
2398
2399 migrate->src[i] = 0;
2400 unlock_page(page);
2401 restore--;
2402
2403 putback_lru_page(page);
2404 }
2405}
2406
2407/*
2408 * migrate_vma_pages() - migrate meta-data from src page to dst page
2409 * @migrate: migrate struct containing all migration information
2410 *
2411 * This migrates struct page meta-data from source struct page to destination
2412 * struct page. This effectively finishes the migration from source page to the
2413 * destination page.
2414 */
2415static void migrate_vma_pages(struct migrate_vma *migrate)
2416{
2417 const unsigned long npages = migrate->npages;
2418 const unsigned long start = migrate->start;
2419 unsigned long addr, i;
2420
2421 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
2422 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
2423 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2424 struct address_space *mapping;
2425 int r;
2426
2427 if (!page || !newpage)
2428 continue;
2429 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
2430 continue;
2431
2432 mapping = page_mapping(page);
2433
2434 r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
2435 if (r != MIGRATEPAGE_SUCCESS)
2436 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2437 }
2438}
2439
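Because migrate_page() is called with the MIGRATE_SYNC_NO_COPY mode introduced by the parent commit, only struct page metadata is moved here; the page contents must already have been copied by the driver's alloc_and_copy() callback. A hypothetical per-page CPU fallback for that callback could look like the sketch below; my_copy_one_page() and the GFP choice are assumptions, while migrate_pfn(), migrate_pfn_to_page() and the MIGRATE_PFN_* flags come from this series:

static unsigned long my_copy_one_page(struct vm_area_struct *vma,
				      unsigned long addr, unsigned long src)
{
	struct page *spage = migrate_pfn_to_page(src);
	struct page *dpage;

	/* Skip holes and pages that prepare/unmap already backed off from. */
	if (!spage || !(src & MIGRATE_PFN_MIGRATE))
		return 0;

	dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
	if (!dpage)
		return 0;

	/* migrate_vma_finalize() unlocks the destination page, so lock it here. */
	lock_page(dpage);
	copy_highpage(dpage, spage);	/* a DMA engine could be used instead */

	return migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
}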
2440/*
2441 * migrate_vma_finalize() - restore CPU page table entry
2442 * @migrate: migrate struct containing all migration information
2443 *
2444 * This replaces the special migration pte entry with either a mapping to the
2445 * new page if migration was successful for that page, or to the original page
2446 * otherwise.
2447 *
2448 * This also unlocks the pages and puts them back on the lru, or drops the extra
2449 * refcount, for device pages.
2450 */
2451static void migrate_vma_finalize(struct migrate_vma *migrate)
2452{
2453 const unsigned long npages = migrate->npages;
2454 unsigned long i;
2455
2456 for (i = 0; i < npages; i++) {
2457 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
2458 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2459
2460 if (!page)
2461 continue;
2462 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
2463 if (newpage) {
2464 unlock_page(newpage);
2465 put_page(newpage);
2466 }
2467 newpage = page;
2468 }
2469
2470 remove_migration_ptes(page, newpage, false);
2471 unlock_page(page);
2472 migrate->cpages--;
2473
2474 putback_lru_page(page);
2475
2476 if (newpage != page) {
2477 unlock_page(newpage);
2478 putback_lru_page(newpage);
2479 }
2480 }
2481}
2482
2483/*
2484 * migrate_vma() - migrate a range of memory inside vma
2485 *
2486 * @ops: migration callback for allocating destination memory and copying
2487 * @vma: virtual memory area containing the range to be migrated
2488 * @start: start address of the range to migrate (inclusive)
2489 * @end: end address of the range to migrate (exclusive)
2490 * @src: array of unsigned long entries encoding the source pfns
2491 * @dst: array of unsigned long entries encoding the destination pfns
2492 * @private: pointer passed back to each of the callbacks
2493 * Returns: 0 on success, error code otherwise
2494 *
2495 * This function tries to migrate a range of virtual addresses, using
2496 * callbacks to allocate and copy memory from source to destination. First it
2497 * collects all the pages backing each virtual address in the range, saving this
2498 * inside the src array. Then it locks those pages and unmaps them. Once the pages
2499 * are locked and unmapped, it checks whether each page is pinned or not. Pages
2500 * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
2501 * in the corresponding src array entry. It then restores any pages that are
2502 * pinned, by remapping and unlocking those pages.
2503 *
2504 * At this point it calls the alloc_and_copy() callback. For documentation on
2505 * what is expected from that callback, see struct migrate_vma_ops comments in
2506 * include/linux/migrate.h
2507 *
2508 * After the alloc_and_copy() callback, this function goes over each entry in
2509 * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
2510 * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
2511 * then the function tries to migrate struct page information from the source
2512 * struct page to the destination struct page. If it fails to migrate the struct
2513 * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
2514 * array.
2515 *
2516 * At this point all successfully migrated pages have an entry in the src
2517 * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2518 * array entry with MIGRATE_PFN_VALID flag set.
2519 *
2520 * It then calls the finalize_and_map() callback. See comments for "struct
2521 * migrate_vma_ops", in include/linux/migrate.h for details about
2522 * finalize_and_map() behavior.
2523 *
2524 * After the finalize_and_map() callback, for successfully migrated pages, this
2525 * function updates the CPU page table to point to new pages, otherwise it
2526 * restores the CPU page table to point to the original source pages.
2527 *
2528 * The function returns 0 after the above steps, even if no pages were
2529 * migrated (it only returns an error if any of the arguments are invalid).
2530 *
2531 * Both the src and dst arrays must be big enough for (end - start) >> PAGE_SHIFT
2532 * unsigned long entries.
2533 */
2534int migrate_vma(const struct migrate_vma_ops *ops,
2535 struct vm_area_struct *vma,
2536 unsigned long start,
2537 unsigned long end,
2538 unsigned long *src,
2539 unsigned long *dst,
2540 void *private)
2541{
2542 struct migrate_vma migrate;
2543
2544 /* Sanity check the arguments */
2545 start &= PAGE_MASK;
2546 end &= PAGE_MASK;
2547 if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
2548 return -EINVAL;
2549 if (start < vma->vm_start || start >= vma->vm_end)
2550 return -EINVAL;
2551 if (end <= vma->vm_start || end > vma->vm_end)
2552 return -EINVAL;
2553 if (!ops || !src || !dst || start >= end)
2554 return -EINVAL;
2555
2556 memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
2557 migrate.src = src;
2558 migrate.dst = dst;
2559 migrate.start = start;
2560 migrate.npages = 0;
2561 migrate.cpages = 0;
2562 migrate.end = end;
2563 migrate.vma = vma;
2564
2565 /* Collect, and try to unmap source pages */
2566 migrate_vma_collect(&migrate);
2567 if (!migrate.cpages)
2568 return 0;
2569
2570 /* Lock and isolate page */
2571 migrate_vma_prepare(&migrate);
2572 if (!migrate.cpages)
2573 return 0;
2574
2575 /* Unmap pages */
2576 migrate_vma_unmap(&migrate);
2577 if (!migrate.cpages)
2578 return 0;
2579
2580 /*
2581 * At this point pages are locked and unmapped, and thus they have
2582 * stable content and can safely be copied to destination memory that
2583 * is allocated by the callback.
2584 *
2585 * Note that migration can fail in migrate_vma_pages() for each
2586 * individual page.
2587 */
2588 ops->alloc_and_copy(vma, src, dst, start, end, private);
2589
2590 /* This does the real migration of struct page */
2591 migrate_vma_pages(&migrate);
2592
2593 ops->finalize_and_map(vma, src, dst, start, end, private);
2594
2595 /* Unlock and remap pages */
2596 migrate_vma_finalize(&migrate);
2597
2598 return 0;
2599}
2600 EXPORT_SYMBOL(migrate_vma);
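A final note on locking: migrate_vma() walks the CPU page tables of @vma through walk_page_range(), so the caller is expected to hold the target process's mmap_sem (read mode is assumed sufficient in this sketch). A hypothetical call site, reusing the my_migrate_ops sketch from above; my_migrate_user_range() is a placeholder name:

static int my_migrate_user_range(struct mm_struct *mm, unsigned long start,
				 unsigned long end, unsigned long *src,
				 unsigned long *dst, void *private)
{
	struct vm_area_struct *vma;
	int ret;

	down_read(&mm->mmap_sem);
	vma = find_vma_intersection(mm, start, end);
	if (!vma || start < vma->vm_start || end > vma->vm_end) {
		up_read(&mm->mmap_sem);
		return -EINVAL;
	}
	ret = migrate_vma(&my_migrate_ops, vma, start, end, src, dst, private);
	up_read(&mm->mmap_sem);
	return ret;
}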