Diffstat (limited to 'mm/migrate.c')
-rw-r--r--  mm/migrate.c  492
1 files changed, 492 insertions, 0 deletions

diff --git a/mm/migrate.c b/mm/migrate.c
index 71de36cfb673..991e8886093f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -428,6 +428,14 @@ int migrate_page_move_mapping(struct address_space *mapping,
 	int expected_count = 1 + extra_count;
 	void **pslot;
 
+	/*
+	 * ZONE_DEVICE pages have an extra refcount always held by their device driver.
+	 *
+	 * Note that DAX memory will never reach this point, as it does not have
+	 * the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h).
+	 */
+	expected_count += is_zone_device_page(page);
+
 	if (!mapping) {
 		/* Anonymous page without mapping */
 		if (page_count(page) != expected_count)
@@ -2106,3 +2114,487 @@ out_unlock:
 #endif /* CONFIG_NUMA_BALANCING */
 
 #endif /* CONFIG_NUMA */
+
+
+struct migrate_vma {
+	struct vm_area_struct *vma;
+	unsigned long *dst;
+	unsigned long *src;
+	unsigned long cpages;
+	unsigned long npages;
+	unsigned long start;
+	unsigned long end;
+};
+
+static int migrate_vma_collect_hole(unsigned long start,
+				    unsigned long end,
+				    struct mm_walk *walk)
+{
+	struct migrate_vma *migrate = walk->private;
+	unsigned long addr;
+
+	for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+		migrate->dst[migrate->npages] = 0;
+		migrate->src[migrate->npages++] = 0;
+	}
+
+	return 0;
+}
+
+static int migrate_vma_collect_pmd(pmd_t *pmdp,
+				   unsigned long start,
+				   unsigned long end,
+				   struct mm_walk *walk)
+{
+	struct migrate_vma *migrate = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long addr = start;
+	spinlock_t *ptl;
+	pte_t *ptep;
+
+again:
+	if (pmd_none(*pmdp))
+		return migrate_vma_collect_hole(start, end, walk);
+
+	if (pmd_trans_huge(*pmdp)) {
+		struct page *page;
+
+		ptl = pmd_lock(mm, pmdp);
+		if (unlikely(!pmd_trans_huge(*pmdp))) {
+			spin_unlock(ptl);
+			goto again;
+		}
+
+		page = pmd_page(*pmdp);
+		if (is_huge_zero_page(page)) {
+			spin_unlock(ptl);
+			split_huge_pmd(vma, pmdp, addr);
+			if (pmd_trans_unstable(pmdp))
+				return migrate_vma_collect_hole(start, end,
+								walk);
+		} else {
+			int ret;
+
+			get_page(page);
+			spin_unlock(ptl);
+			if (unlikely(!trylock_page(page)))
+				return migrate_vma_collect_hole(start, end,
+								walk);
+			ret = split_huge_page(page);
+			unlock_page(page);
+			put_page(page);
+			if (ret || pmd_none(*pmdp))
+				return migrate_vma_collect_hole(start, end,
+								walk);
+		}
+	}
+
+	if (unlikely(pmd_bad(*pmdp)))
+		return migrate_vma_collect_hole(start, end, walk);
+
+	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+	for (; addr < end; addr += PAGE_SIZE, ptep++) {
+		unsigned long mpfn, pfn;
+		struct page *page;
+		pte_t pte;
+
+		pte = *ptep;
+		pfn = pte_pfn(pte);
+
+		if (!pte_present(pte)) {
+			mpfn = pfn = 0;
+			goto next;
+		}
+
+		/* FIXME support THP */
+		page = vm_normal_page(migrate->vma, addr, pte);
+		if (!page || !page->mapping || PageTransCompound(page)) {
+			mpfn = pfn = 0;
+			goto next;
+		}
+
+		/*
+		 * By getting a reference on the page we pin it and that
+		 * blocks any kind of migration. A side effect is that it
+		 * "freezes" the pte.
+		 *
+		 * We drop this reference after isolating the page from the
+		 * lru for non-device pages (device pages are not on the lru
+		 * and thus cannot be dropped from it).
+		 */
+		get_page(page);
+		migrate->cpages++;
+		mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+		mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+
+next:
+		migrate->src[migrate->npages++] = mpfn;
+	}
+	pte_unmap_unlock(ptep - 1, ptl);
+
+	return 0;
+}
+
+/*
+ * migrate_vma_collect() - collect pages over a range of virtual addresses
+ * @migrate: migrate struct containing all migration information
+ *
+ * This will walk the CPU page table. For each virtual address backed by a
+ * valid page, it updates the src array and takes a reference on the page, in
+ * order to pin the page until we lock it and unmap it.
+ */
+static void migrate_vma_collect(struct migrate_vma *migrate)
+{
+	struct mm_walk mm_walk;
+
+	mm_walk.pmd_entry = migrate_vma_collect_pmd;
+	mm_walk.pte_entry = NULL;
+	mm_walk.pte_hole = migrate_vma_collect_hole;
+	mm_walk.hugetlb_entry = NULL;
+	mm_walk.test_walk = NULL;
+	mm_walk.vma = migrate->vma;
+	mm_walk.mm = migrate->vma->vm_mm;
+	mm_walk.private = migrate;
+
+	walk_page_range(migrate->start, migrate->end, &mm_walk);
+
+	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
+}
+
+/*
+ * migrate_vma_check_page() - check if page is pinned or not
+ * @page: struct page to check
+ *
+ * Pinned pages cannot be migrated. This is the same test as in
+ * migrate_page_move_mapping(), except that here we allow migration of a
+ * ZONE_DEVICE page.
+ */
+static bool migrate_vma_check_page(struct page *page)
+{
+	/*
+	 * One extra ref because caller holds an extra reference, either from
+	 * isolate_lru_page() for a regular page, or migrate_vma_collect() for
+	 * a device page.
+	 */
+	int extra = 1;
+
+	/*
+	 * FIXME support THP (transparent huge page), it is a bit more complex
+	 * to check them than regular pages, because they can be mapped with a
+	 * pmd or with a pte (split pte mapping).
+	 */
+	if (PageCompound(page))
+		return false;
+
+	if ((page_count(page) - extra) > page_mapcount(page))
+		return false;
+
+	return true;
+}
+
+/*
+ * migrate_vma_prepare() - lock pages and isolate them from the lru
+ * @migrate: migrate struct containing all migration information
+ *
+ * This locks pages that have been collected by migrate_vma_collect(). Once each
+ * page is locked it is isolated from the lru (for non-device pages). Finally,
+ * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
+ * migrated by concurrent kernel threads.
+ */
+static void migrate_vma_prepare(struct migrate_vma *migrate)
+{
+	const unsigned long npages = migrate->npages;
+	bool allow_drain = true;
+	unsigned long i;
+
+	lru_add_drain();
+
+	for (i = 0; (i < npages) && migrate->cpages; i++) {
+		struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+		if (!page)
+			continue;
+
+		/*
+		 * Because we are migrating several pages there can be
+		 * a deadlock between 2 concurrent migrations where each
+		 * is waiting on the other's page lock.
+		 *
+		 * Make migrate_vma() a best effort thing and back off
+		 * for any page we cannot lock right away.
+		 */
+		if (!trylock_page(page)) {
+			migrate->src[i] = 0;
+			migrate->cpages--;
+			put_page(page);
+			continue;
+		}
+		migrate->src[i] |= MIGRATE_PFN_LOCKED;
+
+		if (!PageLRU(page) && allow_drain) {
+			/* Drain CPU's pagevec */
+			lru_add_drain_all();
+			allow_drain = false;
+		}
+
+		if (isolate_lru_page(page)) {
+			migrate->src[i] = 0;
+			unlock_page(page);
+			migrate->cpages--;
+			put_page(page);
+			continue;
+		}
+
+		if (!migrate_vma_check_page(page)) {
+			migrate->src[i] = 0;
+			unlock_page(page);
+			migrate->cpages--;
+
+			putback_lru_page(page);
+		}
+	}
+}
+
+/*
+ * migrate_vma_unmap() - replace page mapping with special migration pte entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * Replace page mapping (CPU page table pte) with a special migration pte entry
+ * and check again if it has been pinned. Pinned pages are restored because we
+ * cannot migrate them.
+ *
+ * This is the last step before we call the device driver callback to allocate
+ * destination memory and copy contents of original page over to new page.
+ */
+static void migrate_vma_unmap(struct migrate_vma *migrate)
+{
+	int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+	const unsigned long npages = migrate->npages;
+	const unsigned long start = migrate->start;
+	unsigned long addr, i, restore = 0;
+
+	for (i = 0; i < npages; i++) {
+		struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+		if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
+			continue;
+
+		try_to_unmap(page, flags);
+		if (page_mapped(page) || !migrate_vma_check_page(page)) {
+			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+			migrate->cpages--;
+			restore++;
+		}
+	}
+
+	for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
+		struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+		if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+			continue;
+
+		remove_migration_ptes(page, page, false);
+
+		migrate->src[i] = 0;
+		unlock_page(page);
+		restore--;
+
+		putback_lru_page(page);
+	}
+}
+
+/*
+ * migrate_vma_pages() - migrate meta-data from src page to dst page
+ * @migrate: migrate struct containing all migration information
+ *
+ * This migrates struct page meta-data from source struct page to destination
+ * struct page. This effectively finishes the migration from source page to the
+ * destination page.
+ */
+static void migrate_vma_pages(struct migrate_vma *migrate)
+{
+	const unsigned long npages = migrate->npages;
+	const unsigned long start = migrate->start;
+	unsigned long addr, i;
+
+	for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
+		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+		struct page *page = migrate_pfn_to_page(migrate->src[i]);
+		struct address_space *mapping;
+		int r;
+
+		if (!page || !newpage)
+			continue;
+		if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
+			continue;
+
+		mapping = page_mapping(page);
+
+		r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
+		if (r != MIGRATEPAGE_SUCCESS)
+			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+	}
+}
+
+/*
+ * migrate_vma_finalize() - restore CPU page table entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * This replaces the special migration pte entry with either a mapping to the
+ * new page if migration was successful for that page, or to the original page
+ * otherwise.
+ *
+ * This also unlocks the pages and puts them back on the lru, or drops the extra
+ * refcount, for device pages.
+ */
+static void migrate_vma_finalize(struct migrate_vma *migrate)
+{
+	const unsigned long npages = migrate->npages;
+	unsigned long i;
+
+	for (i = 0; i < npages; i++) {
+		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+		struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+		if (!page)
+			continue;
+		if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
+			if (newpage) {
+				unlock_page(newpage);
+				put_page(newpage);
+			}
+			newpage = page;
+		}
+
+		remove_migration_ptes(page, newpage, false);
+		unlock_page(page);
+		migrate->cpages--;
+
+		putback_lru_page(page);
+
+		if (newpage != page) {
+			unlock_page(newpage);
+			putback_lru_page(newpage);
+		}
+	}
+}
+
+/*
+ * migrate_vma() - migrate a range of memory inside vma
+ *
+ * @ops: migration callbacks for allocating destination memory and copying
+ * @vma: virtual memory area containing the range to be migrated
+ * @start: start address of the range to migrate (inclusive)
+ * @end: end address of the range to migrate (exclusive)
+ * @src: array of unsigned long storing the source migrate pfns
+ * @dst: array of unsigned long storing the destination migrate pfns
+ * @private: pointer passed back to each of the callbacks
+ * Returns: 0 on success, error code otherwise
+ *
+ * This function tries to migrate a range of virtual addresses, using callbacks
+ * to allocate and copy memory from source to destination. First it collects
+ * all the pages backing each virtual address in the range, saving them in the
+ * src array. Then it locks those pages and unmaps them. Once the pages are
+ * locked and unmapped, it checks whether each page is pinned or not. Pages
+ * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
+ * in the corresponding src array entry. It then restores any pages that are
+ * pinned, by remapping and unlocking those pages.
+ *
+ * At this point it calls the alloc_and_copy() callback. For documentation on
+ * what is expected from that callback, see the struct migrate_vma_ops comments
+ * in include/linux/migrate.h.
+ *
+ * After the alloc_and_copy() callback, this function goes over each entry in
+ * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flags
+ * set. If the corresponding entry in the dst array has the MIGRATE_PFN_VALID
+ * flag set, then the function tries to migrate struct page information from
+ * the source struct page to the destination struct page. If it fails to
+ * migrate the struct page information, then it clears the MIGRATE_PFN_MIGRATE
+ * flag in the src array.
+ *
+ * At this point all successfully migrated pages have an entry in the src
+ * array with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flags set and the
+ * dst array entry with the MIGRATE_PFN_VALID flag set.
+ *
+ * It then calls the finalize_and_map() callback. See the comments for "struct
+ * migrate_vma_ops", in include/linux/migrate.h, for details about
+ * finalize_and_map() behavior.
+ *
+ * After the finalize_and_map() callback, for successfully migrated pages, this
+ * function updates the CPU page table to point to the new pages; otherwise it
+ * restores the CPU page table to point to the original source pages.
+ *
+ * The function returns 0 after the above steps, even if no pages were migrated
+ * (it only returns an error if any of the arguments are invalid).
+ *
+ * Both the src and dst arrays must be big enough for
+ * (end - start) >> PAGE_SHIFT unsigned long entries.
+ */
+int migrate_vma(const struct migrate_vma_ops *ops,
+		struct vm_area_struct *vma,
+		unsigned long start,
+		unsigned long end,
+		unsigned long *src,
+		unsigned long *dst,
+		void *private)
+{
+	struct migrate_vma migrate;
+
+	/* Sanity check the arguments */
+	start &= PAGE_MASK;
+	end &= PAGE_MASK;
+	if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
+		return -EINVAL;
+	if (start < vma->vm_start || start >= vma->vm_end)
+		return -EINVAL;
+	if (end <= vma->vm_start || end > vma->vm_end)
+		return -EINVAL;
+	if (!ops || !src || !dst || start >= end)
+		return -EINVAL;
+
+	memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
+	migrate.src = src;
+	migrate.dst = dst;
+	migrate.start = start;
+	migrate.npages = 0;
+	migrate.cpages = 0;
+	migrate.end = end;
+	migrate.vma = vma;
+
+	/* Collect source pages and pin them */
+	migrate_vma_collect(&migrate);
+	if (!migrate.cpages)
+		return 0;
+
+	/* Lock and isolate pages */
+	migrate_vma_prepare(&migrate);
+	if (!migrate.cpages)
+		return 0;
+
+	/* Unmap pages */
+	migrate_vma_unmap(&migrate);
+	if (!migrate.cpages)
+		return 0;
+
+	/*
+	 * At this point pages are locked and unmapped, and thus they have
+	 * stable content and can safely be copied to destination memory that
+	 * is allocated by the callback.
+	 *
+	 * Note that migration can still fail in migrate_vma_pages() for each
+	 * individual page.
+	 */
+	ops->alloc_and_copy(vma, src, dst, start, end, private);
+
+	/* This does the real migration of struct page */
+	migrate_vma_pages(&migrate);
+
+	ops->finalize_and_map(vma, src, dst, start, end, private);
+
+	/* Unlock and remap pages */
+	migrate_vma_finalize(&migrate);
+
+	return 0;
+}
+EXPORT_SYMBOL(migrate_vma);
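
For context, the following is a minimal, hypothetical sketch of how a device driver might use this new API; it is not part of the patch. It assumes the struct migrate_vma_ops callback prototypes match the ops->alloc_and_copy()/ops->finalize_and_map() call sites above, and that migrate_pfn(), migrate_pfn_to_page() and the MIGRATE_PFN_* flags are exposed to drivers through include/linux/migrate.h, where the authoritative definitions live. example_alloc_device_page() stands in for a driver's own device-memory allocator and simply grabs a normal page here.

#include <linux/highmem.h>
#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>

/* Stand-in for a driver's device-memory allocator; purely hypothetical. */
static struct page *example_alloc_device_page(void *private)
{
	return alloc_page(GFP_HIGHUSER | __GFP_ZERO);
}

static void example_alloc_and_copy(struct vm_area_struct *vma,
				   const unsigned long *src,
				   unsigned long *dst,
				   unsigned long start,
				   unsigned long end,
				   void *private)
{
	unsigned long addr, i;

	for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
		struct page *spage = migrate_pfn_to_page(src[i]);
		struct page *dpage;

		/* Skip entries the core code decided not to migrate. */
		if (!spage || !(src[i] & MIGRATE_PFN_MIGRATE)) {
			dst[i] = 0;
			continue;
		}

		dpage = example_alloc_device_page(private);
		if (!dpage) {
			dst[i] = 0;
			continue;
		}

		/*
		 * The source page is locked and unmapped here, so its content
		 * is stable. migrate_vma_pages() and migrate_vma_finalize()
		 * expect the destination page to be locked.
		 */
		lock_page(dpage);
		copy_highpage(dpage, spage);
		dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
	}
}

static void example_finalize_and_map(struct vm_area_struct *vma,
				     const unsigned long *src,
				     const unsigned long *dst,
				     unsigned long start,
				     unsigned long end,
				     void *private)
{
	/*
	 * Entries that still have MIGRATE_PFN_MIGRATE set in src[] did
	 * migrate; a real driver would update its device page tables here.
	 */
}

static const struct migrate_vma_ops example_migrate_ops = {
	.alloc_and_copy		= example_alloc_and_copy,
	.finalize_and_map	= example_finalize_and_map,
};

static int example_migrate_range(struct vm_area_struct *vma,
				 unsigned long start, unsigned long end,
				 void *private)
{
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	unsigned long *src, *dst;
	int ret = -ENOMEM;

	/* Both arrays must hold (end - start) >> PAGE_SHIFT entries. */
	src = kcalloc(npages, sizeof(*src), GFP_KERNEL);
	dst = kcalloc(npages, sizeof(*dst), GFP_KERNEL);
	if (!src || !dst)
		goto out;

	ret = migrate_vma(&example_migrate_ops, vma, start, end, src, dst,
			  private);
out:
	kfree(dst);
	kfree(src);
	return ret;
}

The contract visible in this file is that alloc_and_copy() only fills dst[] entries whose src[] entry still has MIGRATE_PFN_MIGRATE set, and that each destination page must be locked so migrate_vma_pages() can migrate into it and migrate_vma_finalize() can unlock it afterwards.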