path: root/mm/migrate.c
authorJérôme Glisse <jglisse@redhat.com>2017-09-08 19:12:09 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2017-09-08 21:26:46 -0400
commit8763cb45ab967a92a5ee49e9c544c0f0ea90e2d6 (patch)
treea2b5041d068fd69ee8a60c6c3ec8adb004ad0ced /mm/migrate.c
parent2916ecc0f9d435d849c98f4da50e453124c87531 (diff)
mm/migrate: new memory migration helper for use with device memory
This patch adds a new memory migration helper which migrates the memory backing a range of virtual addresses of a process to different memory (which can be allocated through a special allocator). It differs from NUMA migration by working on a range of virtual addresses, and therefore by doing the migration in chunks that can be large enough to use a DMA engine or a special copy-offloading engine.

Expected users are anyone with heterogeneous memory where the different memories have different characteristics (latency, bandwidth, ...). As an example, IBM platforms with a CAPI bus can use this feature to migrate between regular memory and CAPI device memory. New CPU architectures with a pool of high-performance memory that is not managed as a cache but presented as regular memory (while being faster and lower latency than DDR) will also be prime users of this patch. Migration to private device memory will be useful for devices that have a large pool of such memory, like GPUs; NVidia plans to use HMM for that.

Link: http://lkml.kernel.org/r/20170817000548.32038-15-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Evgeny Baskakov <ebaskakov@nvidia.com>
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
Signed-off-by: Sherry Cheung <SCheung@nvidia.com>
Signed-off-by: Subhash Gutti <sgutti@nvidia.com>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Nellans <dnellans@nvidia.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Bob Liu <liubo95@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/migrate.c')
-rw-r--r--	mm/migrate.c	492
1 file changed, 492 insertions, 0 deletions
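The helper introduced below is driven by a pair of driver callbacks. As a minimal, hypothetical usage sketch (not part of this patch): struct migrate_vma_ops and the migrate_vma() prototype are the ones this series adds in include/linux/migrate.h, the callback prototypes are inferred from how they are invoked in the code below, and my_alloc_and_copy(), my_finalize_and_map(), my_migrate_ops and my_migrate_range() are placeholder names.

/* Allocate destination memory and copy each candidate page, e.g. via DMA. */
static void my_alloc_and_copy(struct vm_area_struct *vma,
			      const unsigned long *src, unsigned long *dst,
			      unsigned long start, unsigned long end,
			      void *private)
{
	/* Fill dst[] for every src[] entry still flagged MIGRATE_PFN_MIGRATE. */
}

/* Update device page tables once the CPU side has committed the migration. */
static void my_finalize_and_map(struct vm_area_struct *vma,
				const unsigned long *src,
				const unsigned long *dst,
				unsigned long start, unsigned long end,
				void *private)
{
}

static const struct migrate_vma_ops my_migrate_ops = {
	.alloc_and_copy		= my_alloc_and_copy,
	.finalize_and_map	= my_finalize_and_map,
};

/* src and dst must each hold (end - start) >> PAGE_SHIFT entries. */
static int my_migrate_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end, unsigned long *src,
			    unsigned long *dst, void *private)
{
	return migrate_vma(&my_migrate_ops, vma, start, end, src, dst, private);
}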
diff --git a/mm/migrate.c b/mm/migrate.c
index 71de36cfb673..991e8886093f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -428,6 +428,14 @@ int migrate_page_move_mapping(struct address_space *mapping,
428 int expected_count = 1 + extra_count;
429 void **pslot;
430
431 /*
432 * ZONE_DEVICE pages have 1 refcount always held by their device
433 *
434 * Note that DAX memory will never reach that point as it does not have
435 * the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h).
436 */
437 expected_count += is_zone_device_page(page);
438
439 if (!mapping) {
440 /* Anonymous page without mapping */
441 if (page_count(page) != expected_count)
@@ -2106,3 +2114,487 @@ out_unlock:
2114 #endif /* CONFIG_NUMA_BALANCING */
2115
2116 #endif /* CONFIG_NUMA */
2117
2118
2119struct migrate_vma {
2120 struct vm_area_struct *vma;
2121 unsigned long *dst;
2122 unsigned long *src;
2123 unsigned long cpages;
2124 unsigned long npages;
2125 unsigned long start;
2126 unsigned long end;
2127};
2128
2129static int migrate_vma_collect_hole(unsigned long start,
2130 unsigned long end,
2131 struct mm_walk *walk)
2132{
2133 struct migrate_vma *migrate = walk->private;
2134 unsigned long addr;
2135
2136 for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
2137 migrate->dst[migrate->npages] = 0;
2138 migrate->src[migrate->npages++] = 0;
2139 }
2140
2141 return 0;
2142}
2143
2144static int migrate_vma_collect_pmd(pmd_t *pmdp,
2145 unsigned long start,
2146 unsigned long end,
2147 struct mm_walk *walk)
2148{
2149 struct migrate_vma *migrate = walk->private;
2150 struct vm_area_struct *vma = walk->vma;
2151 struct mm_struct *mm = vma->vm_mm;
2152 unsigned long addr = start;
2153 spinlock_t *ptl;
2154 pte_t *ptep;
2155
2156again:
2157 if (pmd_none(*pmdp))
2158 return migrate_vma_collect_hole(start, end, walk);
2159
2160 if (pmd_trans_huge(*pmdp)) {
2161 struct page *page;
2162
2163 ptl = pmd_lock(mm, pmdp);
2164 if (unlikely(!pmd_trans_huge(*pmdp))) {
2165 spin_unlock(ptl);
2166 goto again;
2167 }
2168
2169 page = pmd_page(*pmdp);
2170 if (is_huge_zero_page(page)) {
2171 spin_unlock(ptl);
2172 split_huge_pmd(vma, pmdp, addr);
2173 if (pmd_trans_unstable(pmdp))
2174 return migrate_vma_collect_hole(start, end,
2175 walk);
2176 } else {
2177 int ret;
2178
2179 get_page(page);
2180 spin_unlock(ptl);
2181 if (unlikely(!trylock_page(page)))
2182 return migrate_vma_collect_hole(start, end,
2183 walk);
2184 ret = split_huge_page(page);
2185 unlock_page(page);
2186 put_page(page);
2187 if (ret || pmd_none(*pmdp))
2188 return migrate_vma_collect_hole(start, end,
2189 walk);
2190 }
2191 }
2192
2193 if (unlikely(pmd_bad(*pmdp)))
2194 return migrate_vma_collect_hole(start, end, walk);
2195
2196 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
2197 for (; addr < end; addr += PAGE_SIZE, ptep++) {
2198 unsigned long mpfn, pfn;
2199 struct page *page;
2200 pte_t pte;
2201
2202 pte = *ptep;
2203 pfn = pte_pfn(pte);
2204
2205 if (!pte_present(pte)) {
2206 mpfn = pfn = 0;
2207 goto next;
2208 }
2209
2210 /* FIXME support THP */
2211 page = vm_normal_page(migrate->vma, addr, pte);
2212 if (!page || !page->mapping || PageTransCompound(page)) {
2213 mpfn = pfn = 0;
2214 goto next;
2215 }
2216
2217 /*
2218 * By getting a reference on the page we pin it and that blocks
2219 * any kind of migration. A side effect is that it "freezes" the
2220 * pte.
2221 *
2222 * We drop this reference after isolating the page from the lru,
2223 * for non-device pages (device pages are not on the lru and thus
2224 * cannot be dropped from it).
2225 */
2226 get_page(page);
2227 migrate->cpages++;
2228 mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
2229 mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
2230
2231next:
2232 migrate->src[migrate->npages++] = mpfn;
2233 }
2234 pte_unmap_unlock(ptep - 1, ptl);
2235
2236 return 0;
2237}
2238
2239/*
2240 * migrate_vma_collect() - collect pages over a range of virtual addresses
2241 * @migrate: migrate struct containing all migration information
2242 *
2243 * This will walk the CPU page table. For each virtual address backed by a
2244 * valid page, it updates the src array and takes a reference on the page, in
2245 * order to pin the page until we lock it and unmap it.
2246 */
2247static void migrate_vma_collect(struct migrate_vma *migrate)
2248{
2249 struct mm_walk mm_walk;
2250
2251 mm_walk.pmd_entry = migrate_vma_collect_pmd;
2252 mm_walk.pte_entry = NULL;
2253 mm_walk.pte_hole = migrate_vma_collect_hole;
2254 mm_walk.hugetlb_entry = NULL;
2255 mm_walk.test_walk = NULL;
2256 mm_walk.vma = migrate->vma;
2257 mm_walk.mm = migrate->vma->vm_mm;
2258 mm_walk.private = migrate;
2259
2260 walk_page_range(migrate->start, migrate->end, &mm_walk);
2261
2262 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
2263}
2264
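After migrate_vma_collect() runs, each src[] slot is either 0 (a hole, or an address that cannot be migrated) or a pfn encoded together with status bits. A small, hypothetical helper to show how such an entry can be interpreted; inspect_src_entry() is a placeholder name, while migrate_pfn_to_page() and the MIGRATE_PFN_* flags are the helpers this series adds in include/linux/migrate.h:

static void inspect_src_entry(unsigned long entry)
{
	struct page *page = migrate_pfn_to_page(entry);

	if (!page)
		return;		/* hole, or address that cannot be migrated */

	if (entry & MIGRATE_PFN_MIGRATE)
		pr_debug("page %p is a migration candidate%s\n", page,
			 (entry & MIGRATE_PFN_WRITE) ? " (writable)" : "");
}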
2265/*
2266 * migrate_vma_check_page() - check if page is pinned or not
2267 * @page: struct page to check
2268 *
2269 * Pinned pages cannot be migrated. This is the same test as in
2270 * migrate_page_move_mapping(), except that here we allow migration of a
2271 * ZONE_DEVICE page.
2272 */
2273static bool migrate_vma_check_page(struct page *page)
2274{
2275 /*
2276 * One extra ref because caller holds an extra reference, either from
2277 * isolate_lru_page() for a regular page, or migrate_vma_collect() for
2278 * a device page.
2279 */
2280 int extra = 1;
2281
2282 /*
2283 * FIXME support THP (transparent huge page), it is bit more complex to
2284 * check them than regular pages, because they can be mapped with a pmd
2285 * or with a pte (split pte mapping).
2286 */
2287 if (PageCompound(page))
2288 return false;
2289
2290 if ((page_count(page) - extra) > page_mapcount(page))
2291 return false;
2292
2293 return true;
2294}
2295
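To make the arithmetic above concrete, a worked example (illustrative only, assuming an anonymous page that is not in the swap cache and whose only references are its mappings plus the one held by the migration code):

/*
 * - mapped by a single pte:		page_mapcount(page) == 1
 * - that mapping plus the migration
 *   code's reference:			page_count(page)    == 2
 *
 * page_count - extra == 2 - 1 == 1 == page_mapcount, so the page is not
 * considered pinned and may migrate.  Any additional reference, e.g. one
 * taken by get_user_pages(), raises page_count without raising
 * page_mapcount, the inequality triggers, and the page is skipped.
 */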
2296/*
2297 * migrate_vma_prepare() - lock pages and isolate them from the lru
2298 * @migrate: migrate struct containing all migration information
2299 *
2300 * This locks pages that have been collected by migrate_vma_collect(). Once each
2301 * page is locked it is isolated from the lru (for non-device pages). Finally,
2302 * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
2303 * migrated by concurrent kernel threads.
2304 */
2305static void migrate_vma_prepare(struct migrate_vma *migrate)
2306{
2307 const unsigned long npages = migrate->npages;
2308 bool allow_drain = true;
2309 unsigned long i;
2310
2311 lru_add_drain();
2312
2313 for (i = 0; (i < npages) && migrate->cpages; i++) {
2314 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2315
2316 if (!page)
2317 continue;
2318
2319 /*
2320 * Because we are migrating several pages there can be
2321 * a deadlock between 2 concurrent migrations where each
2322 * is waiting on the other's page lock.
2323 *
2324 * Make migrate_vma() a best-effort thing and back off
2325 * for any page we cannot lock right away.
2326 */
2327 if (!trylock_page(page)) {
2328 migrate->src[i] = 0;
2329 migrate->cpages--;
2330 put_page(page);
2331 continue;
2332 }
2333 migrate->src[i] |= MIGRATE_PFN_LOCKED;
2334
2335 if (!PageLRU(page) && allow_drain) {
2336 /* Drain CPU's pagevec */
2337 lru_add_drain_all();
2338 allow_drain = false;
2339 }
2340
2341 if (isolate_lru_page(page)) {
2342 migrate->src[i] = 0;
2343 unlock_page(page);
2344 migrate->cpages--;
2345 put_page(page);
2346 continue;
2347 }
2348
2349 if (!migrate_vma_check_page(page)) {
2350 migrate->src[i] = 0;
2351 unlock_page(page);
2352 migrate->cpages--;
2353
2354 putback_lru_page(page);
2355 }
2356 }
2357}
2358
2359/*
2360 * migrate_vma_unmap() - replace page mapping with special migration pte entry
2361 * @migrate: migrate struct containing all migration information
2362 *
2363 * Replace page mapping (CPU page table pte) with a special migration pte entry
2364 * and check again if it has been pinned. Pinned pages are restored because we
2365 * cannot migrate them.
2366 *
2367 * This is the last step before we call the device driver callback to allocate
2368 * destination memory and copy contents of original page over to new page.
2369 */
2370static void migrate_vma_unmap(struct migrate_vma *migrate)
2371{
2372 int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
2373 const unsigned long npages = migrate->npages;
2374 const unsigned long start = migrate->start;
2375 unsigned long addr, i, restore = 0;
2376
2377 for (i = 0; i < npages; i++) {
2378 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2379
2380 if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
2381 continue;
2382
2383 try_to_unmap(page, flags);
2384 if (page_mapped(page) || !migrate_vma_check_page(page)) {
2385 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2386 migrate->cpages--;
2387 restore++;
2388 }
2389 }
2390
2391 for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
2392 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2393
2394 if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
2395 continue;
2396
2397 remove_migration_ptes(page, page, false);
2398
2399 migrate->src[i] = 0;
2400 unlock_page(page);
2401 restore--;
2402
2403 putback_lru_page(page);
2404 }
2405}
2406
2407/*
2408 * migrate_vma_pages() - migrate meta-data from src page to dst page
2409 * @migrate: migrate struct containing all migration information
2410 *
2411 * This migrates struct page meta-data from source struct page to destination
2412 * struct page. This effectively finishes the migration from source page to the
2413 * destination page.
2414 */
2415static void migrate_vma_pages(struct migrate_vma *migrate)
2416{
2417 const unsigned long npages = migrate->npages;
2418 const unsigned long start = migrate->start;
2419 unsigned long addr, i;
2420
2421 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
2422 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
2423 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2424 struct address_space *mapping;
2425 int r;
2426
2427 if (!page || !newpage)
2428 continue;
2429 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
2430 continue;
2431
2432 mapping = page_mapping(page);
2433
2434 r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
2435 if (r != MIGRATEPAGE_SUCCESS)
2436 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2437 }
2438}
2439
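Because migrate_page() is called with the MIGRATE_SYNC_NO_COPY mode introduced by the parent commit, only struct page metadata is moved here; the page contents must already have been copied by the driver's alloc_and_copy() callback. A hypothetical per-page CPU fallback for that callback could look like the sketch below; my_copy_one_page() and the GFP choice are assumptions, while migrate_pfn(), migrate_pfn_to_page() and the MIGRATE_PFN_* flags come from this series:

static unsigned long my_copy_one_page(struct vm_area_struct *vma,
				      unsigned long addr, unsigned long src)
{
	struct page *spage = migrate_pfn_to_page(src);
	struct page *dpage;

	/* Skip holes and pages that prepare/unmap already backed off from. */
	if (!spage || !(src & MIGRATE_PFN_MIGRATE))
		return 0;

	dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
	if (!dpage)
		return 0;

	/* migrate_vma_finalize() unlocks the destination page, so lock it here. */
	lock_page(dpage);
	copy_highpage(dpage, spage);	/* a DMA engine could be used instead */

	return migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
}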
2440/*
2441 * migrate_vma_finalize() - restore CPU page table entry
2442 * @migrate: migrate struct containing all migration information
2443 *
2444 * This replaces the special migration pte entry with either a mapping to the
2445 * new page if migration was successful for that page, or to the original page
2446 * otherwise.
2447 *
2448 * This also unlocks the pages and puts them back on the lru, or drops the extra
2449 * refcount, for device pages.
2450 */
2451static void migrate_vma_finalize(struct migrate_vma *migrate)
2452{
2453 const unsigned long npages = migrate->npages;
2454 unsigned long i;
2455
2456 for (i = 0; i < npages; i++) {
2457 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
2458 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2459
2460 if (!page)
2461 continue;
2462 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
2463 if (newpage) {
2464 unlock_page(newpage);
2465 put_page(newpage);
2466 }
2467 newpage = page;
2468 }
2469
2470 remove_migration_ptes(page, newpage, false);
2471 unlock_page(page);
2472 migrate->cpages--;
2473
2474 putback_lru_page(page);
2475
2476 if (newpage != page) {
2477 unlock_page(newpage);
2478 putback_lru_page(newpage);
2479 }
2480 }
2481}
2482
2483/*
2484 * migrate_vma() - migrate a range of memory inside vma
2485 *
2486 * @ops: migration callback for allocating destination memory and copying
2487 * @vma: virtual memory area containing the range to be migrated
2488 * @start: start address of the range to migrate (inclusive)
2489 * @end: end address of the range to migrate (exclusive)
2490 * @src: array of unsigned long entries encoding the source pfns
2491 * @dst: array of unsigned long entries encoding the destination pfns
2492 * @private: pointer passed back to each of the callbacks
2493 * Returns: 0 on success, error code otherwise
2494 *
2495 * This function tries to migrate a range of virtual addresses, using
2496 * callbacks to allocate and copy memory from source to destination. First it
2497 * collects all the pages backing each virtual address in the range, saving this
2498 * inside the src array. Then it locks those pages and unmaps them. Once the pages
2499 * are locked and unmapped, it checks whether each page is pinned or not. Pages
2500 * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
2501 * in the corresponding src array entry. It then restores any pages that are
2502 * pinned, by remapping and unlocking those pages.
2503 *
2504 * At this point it calls the alloc_and_copy() callback. For documentation on
2505 * what is expected from that callback, see struct migrate_vma_ops comments in
2506 * include/linux/migrate.h
2507 *
2508 * After the alloc_and_copy() callback, this function goes over each entry in
2509 * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
2510 * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
2511 * then the function tries to migrate struct page information from the source
2512 * struct page to the destination struct page. If it fails to migrate the struct
2513 * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
2514 * array.
2515 *
2516 * At this point all successfully migrated pages have an entry in the src
2517 * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2518 * array entry with MIGRATE_PFN_VALID flag set.
2519 *
2520 * It then calls the finalize_and_map() callback. See comments for "struct
2521 * migrate_vma_ops", in include/linux/migrate.h for details about
2522 * finalize_and_map() behavior.
2523 *
2524 * After the finalize_and_map() callback, for successfully migrated pages, this
2525 * function updates the CPU page table to point to new pages, otherwise it
2526 * restores the CPU page table to point to the original source pages.
2527 *
2528 * The function returns 0 after the above steps, even if no pages were
2529 * migrated (it only returns an error if any of the arguments are invalid).
2530 *
2531 * Both the src and dst arrays must be big enough for (end - start) >> PAGE_SHIFT
2532 * unsigned long entries.
2533 */
2534int migrate_vma(const struct migrate_vma_ops *ops,
2535 struct vm_area_struct *vma,
2536 unsigned long start,
2537 unsigned long end,
2538 unsigned long *src,
2539 unsigned long *dst,
2540 void *private)
2541{
2542 struct migrate_vma migrate;
2543
2544 /* Sanity check the arguments */
2545 start &= PAGE_MASK;
2546 end &= PAGE_MASK;
2547 if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
2548 return -EINVAL;
2549 if (start < vma->vm_start || start >= vma->vm_end)
2550 return -EINVAL;
2551 if (end <= vma->vm_start || end > vma->vm_end)
2552 return -EINVAL;
2553 if (!ops || !src || !dst || start >= end)
2554 return -EINVAL;
2555
2556 memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
2557 migrate.src = src;
2558 migrate.dst = dst;
2559 migrate.start = start;
2560 migrate.npages = 0;
2561 migrate.cpages = 0;
2562 migrate.end = end;
2563 migrate.vma = vma;
2564
2565 /* Collect, and try to unmap source pages */
2566 migrate_vma_collect(&migrate);
2567 if (!migrate.cpages)
2568 return 0;
2569
2570 /* Lock and isolate page */
2571 migrate_vma_prepare(&migrate);
2572 if (!migrate.cpages)
2573 return 0;
2574
2575 /* Unmap pages */
2576 migrate_vma_unmap(&migrate);
2577 if (!migrate.cpages)
2578 return 0;
2579
2580 /*
2581 * At this point pages are locked and unmapped, and thus they have
2582 * stable content and can safely be copied to destination memory that
2583 * is allocated by the callback.
2584 *
2585 * Note that migration can fail in migrate_vma_pages() for each
2586 * individual page.
2587 */
2588 ops->alloc_and_copy(vma, src, dst, start, end, private);
2589
2590 /* This does the real migration of struct page */
2591 migrate_vma_pages(&migrate);
2592
2593 ops->finalize_and_map(vma, src, dst, start, end, private);
2594
2595 /* Unlock and remap pages */
2596 migrate_vma_finalize(&migrate);
2597
2598 return 0;
2599}
2600 EXPORT_SYMBOL(migrate_vma);
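A final note on locking: migrate_vma() walks the CPU page tables of @vma through walk_page_range(), so the caller is expected to hold the target process's mmap_sem (read mode is assumed sufficient in this sketch). A hypothetical call site, reusing the my_migrate_ops sketch from above; my_migrate_user_range() is a placeholder name:

static int my_migrate_user_range(struct mm_struct *mm, unsigned long start,
				 unsigned long end, unsigned long *src,
				 unsigned long *dst, void *private)
{
	struct vm_area_struct *vma;
	int ret;

	down_read(&mm->mmap_sem);
	vma = find_vma_intersection(mm, start, end);
	if (!vma || start < vma->vm_start || end > vma->vm_end) {
		up_read(&mm->mmap_sem);
		return -EINVAL;
	}
	ret = migrate_vma(&my_migrate_ops, vma, start, end, src, dst, private);
	up_read(&mm->mmap_sem);
	return ret;
}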