author		Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit		c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree		ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /mm/migrate.c
parent		ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent		6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp
Conflicts:
litmus/sched_cedf.c
Diffstat (limited to 'mm/migrate.c')
-rw-r--r--	mm/migrate.c	378
1 files changed, 294 insertions, 84 deletions

diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f4..666e4e677414 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,8 +32,11 @@
 #include <linux/security.h>
 #include <linux/memcontrol.h>
 #include <linux/syscalls.h>
+#include <linux/hugetlb.h>
 #include <linux/gfp.h>
 
+#include <asm/tlbflush.h>
+
 #include "internal.h"
 
 #define lru_to_page(_head)	(list_entry((_head)->prev, struct page, lru))
@@ -95,26 +98,36 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 	pte_t *ptep, pte;
 	spinlock_t *ptl;
 
-	pgd = pgd_offset(mm, addr);
-	if (!pgd_present(*pgd))
-		goto out;
+	if (unlikely(PageHuge(new))) {
+		ptep = huge_pte_offset(mm, addr);
+		if (!ptep)
+			goto out;
+		ptl = &mm->page_table_lock;
+	} else {
+		pgd = pgd_offset(mm, addr);
+		if (!pgd_present(*pgd))
+			goto out;
 
-	pud = pud_offset(pgd, addr);
-	if (!pud_present(*pud))
-		goto out;
+		pud = pud_offset(pgd, addr);
+		if (!pud_present(*pud))
+			goto out;
 
-	pmd = pmd_offset(pud, addr);
-	if (!pmd_present(*pmd))
-		goto out;
+		pmd = pmd_offset(pud, addr);
+		if (pmd_trans_huge(*pmd))
+			goto out;
+		if (!pmd_present(*pmd))
+			goto out;
 
-	ptep = pte_offset_map(pmd, addr);
+		ptep = pte_offset_map(pmd, addr);
 
-	if (!is_swap_pte(*ptep)) {
-		pte_unmap(ptep);
-		goto out;
-	}
+		if (!is_swap_pte(*ptep)) {
+			pte_unmap(ptep);
+			goto out;
+		}
+
+		ptl = pte_lockptr(mm, pmd);
+	}
 
-	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	pte = *ptep;
 	if (!is_swap_pte(pte))
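Note on the hunk above: hugetlb page table entries are not covered by the split PTE locks in this kernel, so the hugepage branch takes the per-mm page_table_lock while the normal path keeps using pte_lockptr(). A minimal sketch of that rule, as a hypothetical helper that is not part of the patch:

#include <linux/mm.h>
#include <linux/hugetlb.h>

/* Which lock guards the entry remove_migration_pte() is about to rewrite. */
static spinlock_t *migration_entry_lockptr(struct mm_struct *mm,
					   struct page *new, pmd_t *pmd)
{
	if (PageHuge(new))
		return &mm->page_table_lock;	/* hugetlb: per-mm lock */
	return pte_lockptr(mm, pmd);		/* regular page: split PTE lock */
}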
@@ -130,10 +143,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 	if (is_write_migration_entry(entry))
 		pte = pte_mkwrite(pte);
+#ifdef CONFIG_HUGETLB_PAGE
+	if (PageHuge(new))
+		pte = pte_mkhuge(pte);
+#endif
 	flush_cache_page(vma, addr, pte_pfn(pte));
 	set_pte_at(mm, addr, ptep, pte);
 
-	if (PageAnon(new))
+	if (PageHuge(new)) {
+		if (PageAnon(new))
+			hugepage_add_anon_rmap(new, vma, addr);
+		else
+			page_dup_rmap(new);
+	} else if (PageAnon(new))
 		page_add_anon_rmap(new, vma, addr);
 	else
 		page_add_file_rmap(new);
@@ -226,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 
 	expected_count = 2 + page_has_private(page);
 	if (page_count(page) != expected_count ||
-		(struct page *)radix_tree_deref_slot(pslot) != page) {
+		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
 		spin_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
 	}
@@ -266,7 +288,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	 */
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	__inc_zone_page_state(newpage, NR_FILE_PAGES);
-	if (PageSwapBacked(page)) {
+	if (!PageSwapCache(page) && PageSwapBacked(page)) {
 		__dec_zone_page_state(page, NR_SHMEM);
 		__inc_zone_page_state(newpage, NR_SHMEM);
 	}
@@ -276,11 +298,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 }
 
 /*
+ * The expected number of remaining references is the same as that
+ * of migrate_page_move_mapping().
+ */
+int migrate_huge_page_move_mapping(struct address_space *mapping,
+				   struct page *newpage, struct page *page)
+{
+	int expected_count;
+	void **pslot;
+
+	if (!mapping) {
+		if (page_count(page) != 1)
+			return -EAGAIN;
+		return 0;
+	}
+
+	spin_lock_irq(&mapping->tree_lock);
+
+	pslot = radix_tree_lookup_slot(&mapping->page_tree,
+					page_index(page));
+
+	expected_count = 2 + page_has_private(page);
+	if (page_count(page) != expected_count ||
+		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	if (!page_freeze_refs(page, expected_count)) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	get_page(newpage);
+
+	radix_tree_replace_slot(pslot, newpage);
+
+	page_unfreeze_refs(page, expected_count);
+
+	__put_page(page);
+
+	spin_unlock_irq(&mapping->tree_lock);
+	return 0;
+}
+
+/*
  * Copy the page to its new location
  */
-static void migrate_page_copy(struct page *newpage, struct page *page)
+void migrate_page_copy(struct page *newpage, struct page *page)
 {
-	copy_highpage(newpage, page);
+	if (PageHuge(page))
+		copy_huge_page(newpage, page);
+	else
+		copy_highpage(newpage, page);
 
 	if (PageError(page))
 		SetPageError(newpage);
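A note on the reference check shared by migrate_page_move_mapping() and the new migrate_huge_page_move_mapping(): both freeze the page only when

	page_count(page) == 2 + page_has_private(page)

that is, one reference held through the page cache's radix tree slot, one held by whoever isolated the page for migration, plus one more if buffer heads are attached. For a hugepage with no mapping the expectation drops to a single remaining reference, which as I read it is the one held by the migration caller itself. Any extra reference, for example a transient get_page() elsewhere, makes the check or page_freeze_refs() fail and the function returns -EAGAIN instead of moving a page somebody is still using.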
@@ -305,7 +375,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
 		 * redo the accounting that clear_page_dirty_for_io undid,
 		 * but we can't use set_page_dirty because that function
 		 * is actually a signal that all of the page has become dirty.
-		 * Wheras only part of our page may be dirty.
+		 * Whereas only part of our page may be dirty.
 		 */
 		__set_page_dirty_nobuffers(newpage);
 	}
@@ -431,7 +501,6 @@ static int writeout(struct address_space *mapping, struct page *page)
 		.nr_to_write = 1,
 		.range_start = 0,
 		.range_end = LLONG_MAX,
-		.nonblocking = 1,
 		.for_reclaim = 1
 	};
 	int rc;
@@ -495,7 +564,7 @@ static int fallback_migrate_page(struct address_space *mapping,
  *   == 0 - success
  */
 static int move_to_new_page(struct page *newpage, struct page *page,
-					int remap_swapcache)
+					int remap_swapcache, bool sync)
 {
 	struct address_space *mapping;
 	int rc;
@@ -517,18 +586,28 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 	mapping = page_mapping(page);
 	if (!mapping)
 		rc = migrate_page(mapping, newpage, page);
-	else if (mapping->a_ops->migratepage)
+	else {
 		/*
-		 * Most pages have a mapping and most filesystems
-		 * should provide a migration function. Anonymous
-		 * pages are part of swap space which also has its
-		 * own migration function. This is the most common
-		 * path for page migration.
+		 * Do not writeback pages if !sync and migratepage is
+		 * not pointing to migrate_page() which is nonblocking
+		 * (swapcache/tmpfs uses migratepage = migrate_page).
 		 */
-		rc = mapping->a_ops->migratepage(mapping,
-						newpage, page);
-	else
-		rc = fallback_migrate_page(mapping, newpage, page);
+		if (PageDirty(page) && !sync &&
+		    mapping->a_ops->migratepage != migrate_page)
+			rc = -EBUSY;
+		else if (mapping->a_ops->migratepage)
+			/*
+			 * Most pages have a mapping and most filesystems
+			 * should provide a migration function. Anonymous
+			 * pages are part of swap space which also has its
+			 * own migration function. This is the most common
+			 * path for page migration.
+			 */
+			rc = mapping->a_ops->migratepage(mapping,
+							newpage, page);
+		else
+			rc = fallback_migrate_page(mapping, newpage, page);
+	}
 
 	if (rc) {
 		newpage->mapping = NULL;
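The restructured else-branch above encodes a single rule before calling into the filesystem: an asynchronous (!sync) migration must never end up writing a dirty page back. A hypothetical helper expressing the same test, shown only as a sketch and not part of the patch:

#include <linux/fs.h>
#include <linux/migrate.h>

/* True if calling mapping->a_ops->migratepage() could block on writeback. */
static bool migratepage_may_block(struct address_space *mapping,
				  struct page *page, bool sync)
{
	if (sync)
		return false;	/* synchronous callers are allowed to wait */
	if (mapping->a_ops->migratepage == migrate_page)
		return false;	/* swapcache/tmpfs path never writes back */
	return PageDirty(page);	/* anything else may have to write the page */
}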
@@ -547,15 +626,14 @@ static int move_to_new_page(struct page *newpage, struct page *page,
  * to the newly allocated page in newpage.
  */
 static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-			struct page *page, int force, int offlining)
+			struct page *page, int force, bool offlining, bool sync)
 {
 	int rc = 0;
 	int *result = NULL;
 	struct page *newpage = get_new_page(page, private, &result);
 	int remap_swapcache = 1;
-	int rcu_locked = 0;
 	int charge = 0;
-	struct mem_cgroup *mem = NULL;
+	struct mem_cgroup *mem;
 	struct anon_vma *anon_vma = NULL;
 
 	if (!newpage)
@@ -565,13 +643,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 		/* page was freed from under us. So we are done. */
 		goto move_newpage;
 	}
+	if (unlikely(PageTransHuge(page)))
+		if (unlikely(split_huge_page(page)))
+			goto move_newpage;
 
 	/* prepare cgroup just returns 0 or -ENOMEM */
 	rc = -EAGAIN;
 
 	if (!trylock_page(page)) {
-		if (!force)
+		if (!force || !sync)
+			goto move_newpage;
+
+		/*
+		 * It's not safe for direct compaction to call lock_page.
+		 * For example, during page readahead pages are added locked
+		 * to the LRU. Later, when the IO completes the pages are
+		 * marked uptodate and unlocked. However, the queueing
+		 * could be merging multiple pages for one bio (e.g.
+		 * mpage_readpages). If an allocation happens for the
+		 * second or third page, the process can end up locking
+		 * the same page twice and deadlocking. Rather than
+		 * trying to be clever about what pages can be locked,
+		 * avoid the use of lock_page for direct compaction
+		 * altogether.
+		 */
+		if (current->flags & PF_MEMALLOC)
 			goto move_newpage;
+
 		lock_page(page);
 	}
 
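The PF_MEMALLOC check above targets direct compaction, which runs from inside the page allocator and therefore cannot risk sleeping in lock_page(). A caller in that situation would use the new asynchronous mode; roughly, with a hypothetical allocation callback name used purely to illustrate the calling convention:

#include <linux/migrate.h>
#include <linux/mm.h>

/* Hypothetical async caller: 'alloc_dst_page' stands in for whatever
 * new_page_t callback the real caller provides. */
static int migrate_async(struct list_head *migratepages,
			 new_page_t alloc_dst_page)
{
	int nr_err;

	nr_err = migrate_pages(migratepages, alloc_dst_page, 0,
			       false /* offlining */, false /* sync */);
	if (nr_err)
		putback_lru_pages(migratepages);	/* caller cleans up */
	return nr_err;
}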
@@ -590,7 +688,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	}
 
 	/* charge against new page */
-	charge = mem_cgroup_prepare_migration(page, newpage, &mem);
+	charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
 	if (charge == -ENOMEM) {
 		rc = -ENOMEM;
 		goto unlock;
@@ -598,6 +696,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	BUG_ON(charge);
 
 	if (PageWriteback(page)) {
+		/*
+		 * For !sync, there is no point retrying as the retry loop
+		 * is expected to be too short for PageWriteback to be cleared
+		 */
+		if (!sync) {
+			rc = -EBUSY;
+			goto uncharge;
+		}
 		if (!force)
 			goto uncharge;
 		wait_on_page_writeback(page);
@@ -605,20 +711,22 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	/*
 	 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
 	 * we cannot notice that anon_vma is freed while we migrates a page.
-	 * This rcu_read_lock() delays freeing anon_vma pointer until the end
+	 * This get_anon_vma() delays freeing anon_vma pointer until the end
 	 * of migration. File cache pages are no problem because of page_lock()
 	 * File Caches may use write_page() or lock_page() in migration, then,
 	 * just care Anon page here.
 	 */
 	if (PageAnon(page)) {
-		rcu_read_lock();
-		rcu_locked = 1;
-
-		/* Determine how to safely use anon_vma */
-		if (!page_mapped(page)) {
-			if (!PageSwapCache(page))
-				goto rcu_unlock;
-
+		/*
+		 * Only page_lock_anon_vma() understands the subtleties of
+		 * getting a hold on an anon_vma from outside one of its mms.
+		 */
+		anon_vma = page_get_anon_vma(page);
+		if (anon_vma) {
+			/*
+			 * Anon page
+			 */
+		} else if (PageSwapCache(page)) {
 			/*
 			 * We cannot be sure that the anon_vma of an unmapped
 			 * swapcache page is safe to use because we don't
@@ -633,13 +741,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 			 */
 			remap_swapcache = 0;
 		} else {
-			/*
-			 * Take a reference count on the anon_vma if the
-			 * page is mapped so that it is guaranteed to
-			 * exist when the page is remapped later
-			 */
-			anon_vma = page_anon_vma(page);
-			get_anon_vma(anon_vma);
+			goto uncharge;
 		}
 	}
 
@@ -656,16 +758,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	 * free the metadata, so the page can be freed.
 	 */
 	if (!page->mapping) {
-		if (!PageAnon(page) && page_has_private(page)) {
-			/*
-			 * Go direct to try_to_free_buffers() here because
-			 * a) that's what try_to_release_page() would do anyway
-			 * b) we may be under rcu_read_lock() here, so we can't
-			 * use GFP_KERNEL which is what try_to_release_page()
-			 * needs to be effective.
-			 */
+		VM_BUG_ON(PageAnon(page));
+		if (page_has_private(page)) {
 			try_to_free_buffers(page);
-			goto rcu_unlock;
+			goto uncharge;
 		}
 		goto skip_unmap;
 	}
@@ -675,24 +771,22 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 
 skip_unmap:
 	if (!page_mapped(page))
-		rc = move_to_new_page(newpage, page, remap_swapcache);
+		rc = move_to_new_page(newpage, page, remap_swapcache, sync);
 
 	if (rc && remap_swapcache)
 		remove_migration_ptes(page, page);
-rcu_unlock:
 
 	/* Drop an anon_vma reference if we took one */
 	if (anon_vma)
-		drop_anon_vma(anon_vma);
+		put_anon_vma(anon_vma);
 
-	if (rcu_locked)
-		rcu_read_unlock();
 uncharge:
 	if (!charge)
-		mem_cgroup_end_migration(mem, page, newpage);
+		mem_cgroup_end_migration(mem, page, newpage, rc == 0);
 unlock:
 	unlock_page(page);
 
+move_newpage:
 	if (rc != -EAGAIN) {
 		/*
 		 * A page that has been migrated has all references
@@ -706,8 +800,6 @@ unlock:
 		putback_lru_page(page);
 	}
 
-move_newpage:
-
 	/*
 	 * Move the new page to the LRU. If migration was not successful
 	 * then this will free the page.
@@ -724,6 +816,76 @@ move_newpage:
 }
 
 /*
+ * Counterpart of unmap_and_move_page() for hugepage migration.
+ *
+ * This function doesn't wait the completion of hugepage I/O
+ * because there is no race between I/O and migration for hugepage.
+ * Note that currently hugepage I/O occurs only in direct I/O
+ * where no lock is held and PG_writeback is irrelevant,
+ * and writeback status of all subpages are counted in the reference
+ * count of the head page (i.e. if all subpages of a 2MB hugepage are
+ * under direct I/O, the reference of the head page is 512 and a bit more.)
+ * This means that when we try to migrate hugepage whose subpages are
+ * doing direct I/O, some references remain after try_to_unmap() and
+ * hugepage migration fails without data corruption.
+ *
+ * There is also no race when direct I/O is issued on the page under migration,
+ * because then pte is replaced with migration swap entry and direct I/O code
+ * will wait in the page fault for migration to complete.
+ */
+static int unmap_and_move_huge_page(new_page_t get_new_page,
+				unsigned long private, struct page *hpage,
+				int force, bool offlining, bool sync)
+{
+	int rc = 0;
+	int *result = NULL;
+	struct page *new_hpage = get_new_page(hpage, private, &result);
+	struct anon_vma *anon_vma = NULL;
+
+	if (!new_hpage)
+		return -ENOMEM;
+
+	rc = -EAGAIN;
+
+	if (!trylock_page(hpage)) {
+		if (!force || !sync)
+			goto out;
+		lock_page(hpage);
+	}
+
+	if (PageAnon(hpage))
+		anon_vma = page_get_anon_vma(hpage);
+
+	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+
+	if (!page_mapped(hpage))
+		rc = move_to_new_page(new_hpage, hpage, 1, sync);
+
+	if (rc)
+		remove_migration_ptes(hpage, hpage);
+
+	if (anon_vma)
+		put_anon_vma(anon_vma);
+out:
+	unlock_page(hpage);
+
+	if (rc != -EAGAIN) {
+		list_del(&hpage->lru);
+		put_page(hpage);
+	}
+
+	put_page(new_hpage);
+
+	if (result) {
+		if (rc)
+			*result = rc;
+		else
+			*result = page_to_nid(new_hpage);
+	}
+	return rc;
+}
+
+/*
  * migrate_pages
  *
  * The function takes one list of pages to migrate and a function
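To make the reference arithmetic in the new comment concrete: a 2MB hugepage is made up of 2MB / 4KB = 512 base pages, and direct I/O pins each of them through the head page, so while such I/O is in flight page_count(hpage) sits roughly 512 above what migration expects ("512 and a bit more" in the comment's words). The count check therefore fails, unmap_and_move_huge_page() ends up returning -EAGAIN, and the hugepage is simply left in place until the I/O drains, which is how migration stays safe without waiting on PG_writeback.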
@@ -732,13 +894,15 @@ move_newpage:
 *
 * The function returns after 10 attempts or if no pages
 * are movable anymore because to has become empty
- * or no retryable pages exist anymore. All pages will be
- * returned to the LRU or freed.
+ * or no retryable pages exist anymore.
+ * Caller should call putback_lru_pages to return pages to the LRU
+ * or free list only if ret != 0.
 *
 * Return: Number of pages not migrated or error code.
 */
 int migrate_pages(struct list_head *from,
-		new_page_t get_new_page, unsigned long private, int offlining)
+		new_page_t get_new_page, unsigned long private, bool offlining,
+		bool sync)
 {
 	int retry = 1;
 	int nr_failed = 0;
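The signature change above also makes the allocation callback worth spelling out. new_page_t is the callback type declared in include/linux/migrate.h; the example callback below is illustrative only (the names and the use of 'private' as a node id are this sketch's assumptions, not something the patch requires):

#include <linux/migrate.h>
#include <linux/gfp.h>

/* Allocate the destination page on a given node; 'private' carries the
 * node id in this example, which is purely the caller's convention. */
static struct page *new_page_on_node(struct page *page, unsigned long private,
				     int **result)
{
	return alloc_pages_exact_node((int)private,
				      GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}

A caller would then do err = migrate_pages(&pagelist, new_page_on_node, nid, false, true) and, because pages are no longer put back automatically, call putback_lru_pages(&pagelist) itself whenever err != 0, exactly as the updated do_move_page_to_node_array() hunk further down does.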
@@ -758,7 +922,8 @@ int migrate_pages(struct list_head *from,
 			cond_resched();
 
 			rc = unmap_and_move(get_new_page, private,
-						page, pass > 2, offlining);
+						page, pass > 2, offlining,
+						sync);
 
 			switch(rc) {
 			case -ENOMEM:
@@ -780,8 +945,50 @@ out:
 	if (!swapwrite)
 		current->flags &= ~PF_SWAPWRITE;
 
-	putback_lru_pages(from);
+	if (rc)
+		return rc;
+
+	return nr_failed + retry;
+}
+
+int migrate_huge_pages(struct list_head *from,
+		new_page_t get_new_page, unsigned long private, bool offlining,
+		bool sync)
+{
+	int retry = 1;
+	int nr_failed = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int rc;
+
+	for (pass = 0; pass < 10 && retry; pass++) {
+		retry = 0;
+
+		list_for_each_entry_safe(page, page2, from, lru) {
+			cond_resched();
+
+			rc = unmap_and_move_huge_page(get_new_page,
+					private, page, pass > 2, offlining,
+					sync);
 
+			switch(rc) {
+			case -ENOMEM:
+				goto out;
+			case -EAGAIN:
+				retry++;
+				break;
+			case 0:
+				break;
+			default:
+				/* Permanent failure */
+				nr_failed++;
+				break;
+			}
+		}
+	}
+	rc = 0;
+out:
 	if (rc)
 		return rc;
 
@@ -841,10 +1048,10 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
 
 		err = -EFAULT;
 		vma = find_vma(mm, pp->addr);
-		if (!vma || !vma_migratable(vma))
+		if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
 			goto set_status;
 
-		page = follow_page(vma, pp->addr, FOLL_GET);
+		page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
 
 		err = PTR_ERR(page);
 		if (IS_ERR(page))
@@ -890,9 +1097,12 @@ set_status:
 	}
 
 	err = 0;
-	if (!list_empty(&pagelist))
+	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, new_page_node,
-				(unsigned long)pm, 0);
+				(unsigned long)pm, 0, true);
+		if (err)
+			putback_lru_pages(&pagelist);
+	}
 
 	up_read(&mm->mmap_sem);
 	return err;
@@ -1005,7 +1215,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
 		int err = -EFAULT;
 
 		vma = find_vma(mm, addr);
-		if (!vma)
+		if (!vma || addr < vma->vm_start)
 			goto set_status;
 
 		page = follow_page(vma, addr, 0);
@@ -1086,14 +1296,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
 		return -EPERM;
 
 	/* Find the mm_struct */
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	task = pid ? find_task_by_vpid(pid) : current;
 	if (!task) {
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 		return -ESRCH;
 	}
 	mm = get_task_mm(task);
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	if (!mm)
 		return -EINVAL;