Diffstat (limited to 'mm/migrate.c'):

 -rw-r--r--  mm/migrate.c | 234 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 216 insertions(+), 18 deletions(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f4..f8c9bccf2520 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,6 +32,7 @@
 #include <linux/security.h>
 #include <linux/memcontrol.h>
 #include <linux/syscalls.h>
+#include <linux/hugetlb.h>
 #include <linux/gfp.h>
 
 #include "internal.h"
@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 	pte_t *ptep, pte;
 	spinlock_t *ptl;
 
-	pgd = pgd_offset(mm, addr);
-	if (!pgd_present(*pgd))
-		goto out;
+	if (unlikely(PageHuge(new))) {
+		ptep = huge_pte_offset(mm, addr);
+		if (!ptep)
+			goto out;
+		ptl = &mm->page_table_lock;
+	} else {
+		pgd = pgd_offset(mm, addr);
+		if (!pgd_present(*pgd))
+			goto out;
 
-	pud = pud_offset(pgd, addr);
-	if (!pud_present(*pud))
-		goto out;
+		pud = pud_offset(pgd, addr);
+		if (!pud_present(*pud))
+			goto out;
 
-	pmd = pmd_offset(pud, addr);
-	if (!pmd_present(*pmd))
-		goto out;
+		pmd = pmd_offset(pud, addr);
+		if (!pmd_present(*pmd))
+			goto out;
 
-	ptep = pte_offset_map(pmd, addr);
+		ptep = pte_offset_map(pmd, addr);
 
-	if (!is_swap_pte(*ptep)) {
-		pte_unmap(ptep);
-		goto out;
-	}
+		if (!is_swap_pte(*ptep)) {
+			pte_unmap(ptep);
+			goto out;
+		}
+
+		ptl = pte_lockptr(mm, pmd);
+	}
 
-	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	pte = *ptep;
 	if (!is_swap_pte(pte))
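
The rewritten walk above encodes a locking asymmetry worth spelling out: hugepage PTEs are looked up with huge_pte_offset() and serialized by the per-mm page_table_lock, while normal PTEs keep using the split PTE locks keyed off the pmd. A minimal sketch of that selection rule, assuming the 2.6.36-era locking model in which split PTE locks never cover hugetlb entries (the helper name is illustrative, not part of this patch):

static spinlock_t *migration_entry_lockptr(struct mm_struct *mm, pmd_t *pmd,
					   struct page *new)
{
	if (unlikely(PageHuge(new)))
		return &mm->page_table_lock;	/* hugetlb PTEs: one lock per mm */
	return pte_lockptr(mm, pmd);		/* normal PTEs: split lock */
}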
@@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 	if (is_write_migration_entry(entry))
 		pte = pte_mkwrite(pte);
+#ifdef CONFIG_HUGETLB_PAGE
+	if (PageHuge(new))
+		pte = pte_mkhuge(pte);
+#endif
 	flush_cache_page(vma, addr, pte_pfn(pte));
 	set_pte_at(mm, addr, ptep, pte);
 
-	if (PageAnon(new))
+	if (PageHuge(new)) {
+		if (PageAnon(new))
+			hugepage_add_anon_rmap(new, vma, addr);
+		else
+			page_dup_rmap(new);
+	} else if (PageAnon(new))
 		page_add_anon_rmap(new, vma, addr);
 	else
 		page_add_file_rmap(new);
@@ -276,11 +294,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 }
 
 /*
+ * The expected number of remaining references is the same as that
+ * of migrate_page_move_mapping().
+ */
+int migrate_huge_page_move_mapping(struct address_space *mapping,
+				   struct page *newpage, struct page *page)
+{
+	int expected_count;
+	void **pslot;
+
+	if (!mapping) {
+		if (page_count(page) != 1)
+			return -EAGAIN;
+		return 0;
+	}
+
+	spin_lock_irq(&mapping->tree_lock);
+
+	pslot = radix_tree_lookup_slot(&mapping->page_tree,
+					page_index(page));
+
+	expected_count = 2 + page_has_private(page);
+	if (page_count(page) != expected_count ||
+		(struct page *)radix_tree_deref_slot(pslot) != page) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	if (!page_freeze_refs(page, expected_count)) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	get_page(newpage);
+
+	radix_tree_replace_slot(pslot, newpage);
+
+	page_unfreeze_refs(page, expected_count);
+
+	__put_page(page);
+
+	spin_unlock_irq(&mapping->tree_lock);
+	return 0;
+}
+
+/*
  * Copy the page to its new location
  */
-static void migrate_page_copy(struct page *newpage, struct page *page)
+void migrate_page_copy(struct page *newpage, struct page *page)
 {
-	copy_highpage(newpage, page);
+	if (PageHuge(page))
+		copy_huge_page(newpage, page);
+	else
+		copy_highpage(newpage, page);
 
 	if (PageError(page))
 		SetPageError(newpage);
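
migrate_huge_page_move_mapping() mirrors the refcount handshake of migrate_page_move_mapping(): the radix tree slot is swapped only if the page holds exactly the references the page cache and the isolating caller account for, with page_freeze_refs() closing the race against concurrent speculative lookups. A hypothetical helper restating that invariant (illustrative, not part of this patch):

/*
 * One reference from the page cache slot, one from the caller that
 * isolated the page, plus one if private data (buffers) is attached.
 * Anything extra -- e.g. in-flight direct I/O counted on the head
 * page -- makes the check fail and migration retries with -EAGAIN.
 */
static bool huge_page_refs_stable(struct address_space *mapping,
				  struct page *page)
{
	if (!mapping)	/* anonymous hugepage: only the isolation ref */
		return page_count(page) == 1;
	return page_count(page) == 2 + page_has_private(page);
}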
@@ -724,6 +790,92 @@ move_newpage:
 }
 
 /*
+ * Counterpart of unmap_and_move_page() for hugepage migration.
+ *
+ * This function doesn't wait for the completion of hugepage I/O
+ * because there is no race between I/O and migration for hugepages.
+ * Note that currently hugepage I/O occurs only in direct I/O
+ * where no lock is held and PG_writeback is irrelevant,
+ * and the writeback status of all subpages is counted in the reference
+ * count of the head page (i.e. if all subpages of a 2MB hugepage are
+ * under direct I/O, the head page's reference count is 512 and a bit more.)
+ * This means that when we try to migrate a hugepage whose subpages are
+ * doing direct I/O, some references remain after try_to_unmap() and
+ * hugepage migration fails without data corruption.
+ *
+ * There is also no race when direct I/O is issued on the page under migration,
+ * because then the pte is replaced with a migration swap entry and direct I/O
+ * code will wait in the page fault for migration to complete.
+ */
+static int unmap_and_move_huge_page(new_page_t get_new_page,
+				unsigned long private, struct page *hpage,
+				int force, int offlining)
+{
+	int rc = 0;
+	int *result = NULL;
+	struct page *new_hpage = get_new_page(hpage, private, &result);
+	int rcu_locked = 0;
+	struct anon_vma *anon_vma = NULL;
+
+	if (!new_hpage)
+		return -ENOMEM;
+
+	rc = -EAGAIN;
+
+	if (!trylock_page(hpage)) {
+		if (!force)
+			goto out;
+		lock_page(hpage);
+	}
+
+	if (PageAnon(hpage)) {
+		rcu_read_lock();
+		rcu_locked = 1;
+
+		if (page_mapped(hpage)) {
+			anon_vma = page_anon_vma(hpage);
+			atomic_inc(&anon_vma->external_refcount);
+		}
+	}
+
+	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+
+	if (!page_mapped(hpage))
+		rc = move_to_new_page(new_hpage, hpage, 1);
+
+	if (rc)
+		remove_migration_ptes(hpage, hpage);
+
+	if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
+					    &anon_vma->lock)) {
+		int empty = list_empty(&anon_vma->head);
+		spin_unlock(&anon_vma->lock);
+		if (empty)
+			anon_vma_free(anon_vma);
+	}
+
+	if (rcu_locked)
+		rcu_read_unlock();
+out:
+	unlock_page(hpage);
+
+	if (rc != -EAGAIN) {
+		list_del(&hpage->lru);
+		put_page(hpage);
+	}
+
+	put_page(new_hpage);
+
+	if (result) {
+		if (rc)
+			*result = rc;
+		else
+			*result = page_to_nid(new_hpage);
+	}
+	return rc;
+}
+
+/*
  * migrate_pages
  *
  * The function takes one list of pages to migrate and a function
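
The arithmetic in the comment above deserves to be explicit: with 4KB base pages, a 2MB hugepage has 512 subpages, and direct I/O on every subpage pins one reference each, all accounted on the head page. A hypothetical predicate capturing why such a page fails to migrate (illustrative only; the real code simply observes page_count() exceeding the expected count):

static bool hugepage_pinned_by_dio(struct page *hpage, int expected_count)
{
	/*
	 * 2MB / 4KB = 512 subpages; each in-flight direct I/O request
	 * holds one reference, all counted on the head page, so the
	 * count stays above expected_count and migration backs off
	 * with -EAGAIN until the I/O completes.
	 */
	return page_count(hpage) > expected_count;
}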
@@ -788,6 +940,52 @@ out:
 	return nr_failed + retry;
 }
 
+int migrate_huge_pages(struct list_head *from,
+		new_page_t get_new_page, unsigned long private, int offlining)
+{
+	int retry = 1;
+	int nr_failed = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int rc;
+
+	for (pass = 0; pass < 10 && retry; pass++) {
+		retry = 0;
+
+		list_for_each_entry_safe(page, page2, from, lru) {
+			cond_resched();
+
+			rc = unmap_and_move_huge_page(get_new_page,
+					private, page, pass > 2, offlining);
+
+			switch(rc) {
+			case -ENOMEM:
+				goto out;
+			case -EAGAIN:
+				retry++;
+				break;
+			case 0:
+				break;
+			default:
+				/* Permanent failure */
+				nr_failed++;
+				break;
+			}
+		}
+	}
+	rc = 0;
+out:
+
+	list_for_each_entry_safe(page, page2, from, lru)
+		put_page(page);
+
+	if (rc)
+		return rc;
+
+	return nr_failed + retry;
+}
+
 #ifdef CONFIG_NUMA
 /*
  * Move a list of individual pages
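
For context, a sketch of how a caller might drive the new entry point, modeled loosely on the soft-offline path elsewhere in this series. The allocator callback relies on alloc_huge_page_node() from a companion patch, and both function names here are illustrative assumptions, not part of this diff:

static struct page *new_hpage_for_migration(struct page *hpage,
				unsigned long private, int **result)
{
	/* any destination hugepage allocator works; this one is assumed */
	return alloc_huge_page_node(page_hstate(compound_head(hpage)),
					numa_node_id());
}

static int migrate_one_hugepage(struct page *hpage)
{
	LIST_HEAD(pagelist);

	get_page(hpage);		/* isolation reference */
	list_add(&hpage->lru, &pagelist);
	/* migrate_huge_pages() drops the list references itself */
	return migrate_huge_pages(&pagelist, new_hpage_for_migration, 0, 0);
}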
