Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c         | 233
-rw-r--r--  mm/memory-failure.c  | 102
-rw-r--r--  mm/memory.c          |   3
-rw-r--r--  mm/migrate.c         | 234
-rw-r--r--  mm/rmap.c            |  25
5 files changed, 482 insertions, 115 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182..96991ded82fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
| @@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page, | |||
| 423 | } | 423 | } |
| 424 | } | 424 | } |
| 425 | 425 | ||
| 426 | static void copy_gigantic_page(struct page *dst, struct page *src, | 426 | static void copy_user_gigantic_page(struct page *dst, struct page *src, |
| 427 | unsigned long addr, struct vm_area_struct *vma) | 427 | unsigned long addr, struct vm_area_struct *vma) |
| 428 | { | 428 | { |
| 429 | int i; | 429 | int i; |
| 430 | struct hstate *h = hstate_vma(vma); | 430 | struct hstate *h = hstate_vma(vma); |
| 431 | struct page *dst_base = dst; | 431 | struct page *dst_base = dst; |
| 432 | struct page *src_base = src; | 432 | struct page *src_base = src; |
| 433 | might_sleep(); | 433 | |
| 434 | for (i = 0; i < pages_per_huge_page(h); ) { | 434 | for (i = 0; i < pages_per_huge_page(h); ) { |
| 435 | cond_resched(); | 435 | cond_resched(); |
| 436 | copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); | 436 | copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); |
| @@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src, | |||
| 440 | src = mem_map_next(src, src_base, i); | 440 | src = mem_map_next(src, src_base, i); |
| 441 | } | 441 | } |
| 442 | } | 442 | } |
| 443 | static void copy_huge_page(struct page *dst, struct page *src, | 443 | |
| 444 | static void copy_user_huge_page(struct page *dst, struct page *src, | ||
| 444 | unsigned long addr, struct vm_area_struct *vma) | 445 | unsigned long addr, struct vm_area_struct *vma) |
| 445 | { | 446 | { |
| 446 | int i; | 447 | int i; |
| 447 | struct hstate *h = hstate_vma(vma); | 448 | struct hstate *h = hstate_vma(vma); |
| 448 | 449 | ||
| 449 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { | 450 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { |
| 450 | copy_gigantic_page(dst, src, addr, vma); | 451 | copy_user_gigantic_page(dst, src, addr, vma); |
| 451 | return; | 452 | return; |
| 452 | } | 453 | } |
| 453 | 454 | ||
| @@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src, | |||
| 458 | } | 459 | } |
| 459 | } | 460 | } |
| 460 | 461 | ||
| 462 | static void copy_gigantic_page(struct page *dst, struct page *src) | ||
| 463 | { | ||
| 464 | int i; | ||
| 465 | struct hstate *h = page_hstate(src); | ||
| 466 | struct page *dst_base = dst; | ||
| 467 | struct page *src_base = src; | ||
| 468 | |||
| 469 | for (i = 0; i < pages_per_huge_page(h); ) { | ||
| 470 | cond_resched(); | ||
| 471 | copy_highpage(dst, src); | ||
| 472 | |||
| 473 | i++; | ||
| 474 | dst = mem_map_next(dst, dst_base, i); | ||
| 475 | src = mem_map_next(src, src_base, i); | ||
| 476 | } | ||
| 477 | } | ||
| 478 | |||
| 479 | void copy_huge_page(struct page *dst, struct page *src) | ||
| 480 | { | ||
| 481 | int i; | ||
| 482 | struct hstate *h = page_hstate(src); | ||
| 483 | |||
| 484 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { | ||
| 485 | copy_gigantic_page(dst, src); | ||
| 486 | return; | ||
| 487 | } | ||
| 488 | |||
| 489 | might_sleep(); | ||
| 490 | for (i = 0; i < pages_per_huge_page(h); i++) { | ||
| 491 | cond_resched(); | ||
| 492 | copy_highpage(dst + i, src + i); | ||
| 493 | } | ||
| 494 | } | ||
| 495 | |||
| 461 | static void enqueue_huge_page(struct hstate *h, struct page *page) | 496 | static void enqueue_huge_page(struct hstate *h, struct page *page) |
| 462 | { | 497 | { |
| 463 | int nid = page_to_nid(page); | 498 | int nid = page_to_nid(page); |
| @@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) | |||
| 466 | h->free_huge_pages_node[nid]++; | 501 | h->free_huge_pages_node[nid]++; |
| 467 | } | 502 | } |
| 468 | 503 | ||
| 504 | static struct page *dequeue_huge_page_node(struct hstate *h, int nid) | ||
| 505 | { | ||
| 506 | struct page *page; | ||
| 507 | |||
| 508 | if (list_empty(&h->hugepage_freelists[nid])) | ||
| 509 | return NULL; | ||
| 510 | page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); | ||
| 511 | list_del(&page->lru); | ||
| 512 | set_page_refcounted(page); | ||
| 513 | h->free_huge_pages--; | ||
| 514 | h->free_huge_pages_node[nid]--; | ||
| 515 | return page; | ||
| 516 | } | ||
| 517 | |||
| 469 | static struct page *dequeue_huge_page_vma(struct hstate *h, | 518 | static struct page *dequeue_huge_page_vma(struct hstate *h, |
| 470 | struct vm_area_struct *vma, | 519 | struct vm_area_struct *vma, |
| 471 | unsigned long address, int avoid_reserve) | 520 | unsigned long address, int avoid_reserve) |
| 472 | { | 521 | { |
| 473 | int nid; | ||
| 474 | struct page *page = NULL; | 522 | struct page *page = NULL; |
| 475 | struct mempolicy *mpol; | 523 | struct mempolicy *mpol; |
| 476 | nodemask_t *nodemask; | 524 | nodemask_t *nodemask; |
| @@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
| 496 | 544 | ||
| 497 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 545 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
| 498 | MAX_NR_ZONES - 1, nodemask) { | 546 | MAX_NR_ZONES - 1, nodemask) { |
| 499 | nid = zone_to_nid(zone); | 547 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { |
| 500 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && | 548 | page = dequeue_huge_page_node(h, zone_to_nid(zone)); |
| 501 | !list_empty(&h->hugepage_freelists[nid])) { | 549 | if (page) { |
| 502 | page = list_entry(h->hugepage_freelists[nid].next, | 550 | if (!avoid_reserve) |
| 503 | struct page, lru); | 551 | decrement_hugepage_resv_vma(h, vma); |
| 504 | list_del(&page->lru); | 552 | break; |
| 505 | h->free_huge_pages--; | 553 | } |
| 506 | h->free_huge_pages_node[nid]--; | ||
| 507 | |||
| 508 | if (!avoid_reserve) | ||
| 509 | decrement_hugepage_resv_vma(h, vma); | ||
| 510 | |||
| 511 | break; | ||
| 512 | } | 554 | } |
| 513 | } | 555 | } |
| 514 | err: | 556 | err: |
| @@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, | |||
| 770 | return ret; | 812 | return ret; |
| 771 | } | 813 | } |
| 772 | 814 | ||
| 773 | static struct page *alloc_buddy_huge_page(struct hstate *h, | 815 | static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) |
| 774 | struct vm_area_struct *vma, unsigned long address) | ||
| 775 | { | 816 | { |
| 776 | struct page *page; | 817 | struct page *page; |
| 777 | unsigned int nid; | 818 | unsigned int r_nid; |
| 778 | 819 | ||
| 779 | if (h->order >= MAX_ORDER) | 820 | if (h->order >= MAX_ORDER) |
| 780 | return NULL; | 821 | return NULL; |
| @@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, | |||
| 812 | } | 853 | } |
| 813 | spin_unlock(&hugetlb_lock); | 854 | spin_unlock(&hugetlb_lock); |
| 814 | 855 | ||
| 815 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP| | 856 | if (nid == NUMA_NO_NODE) |
| 816 | __GFP_REPEAT|__GFP_NOWARN, | 857 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP| |
| 817 | huge_page_order(h)); | 858 | __GFP_REPEAT|__GFP_NOWARN, |
| 859 | huge_page_order(h)); | ||
| 860 | else | ||
| 861 | page = alloc_pages_exact_node(nid, | ||
| 862 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| | ||
| 863 | __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); | ||
| 818 | 864 | ||
| 819 | if (page && arch_prepare_hugepage(page)) { | 865 | if (page && arch_prepare_hugepage(page)) { |
| 820 | __free_pages(page, huge_page_order(h)); | 866 | __free_pages(page, huge_page_order(h)); |
| @@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, | |||
| 823 | 869 | ||
| 824 | spin_lock(&hugetlb_lock); | 870 | spin_lock(&hugetlb_lock); |
| 825 | if (page) { | 871 | if (page) { |
| 826 | /* | 872 | r_nid = page_to_nid(page); |
| 827 | * This page is now managed by the hugetlb allocator and has | ||
| 828 | * no users -- drop the buddy allocator's reference. | ||
| 829 | */ | ||
| 830 | put_page_testzero(page); | ||
| 831 | VM_BUG_ON(page_count(page)); | ||
| 832 | nid = page_to_nid(page); | ||
| 833 | set_compound_page_dtor(page, free_huge_page); | 873 | set_compound_page_dtor(page, free_huge_page); |
| 834 | /* | 874 | /* |
| 835 | * We incremented the global counters already | 875 | * We incremented the global counters already |
| 836 | */ | 876 | */ |
| 837 | h->nr_huge_pages_node[nid]++; | 877 | h->nr_huge_pages_node[r_nid]++; |
| 838 | h->surplus_huge_pages_node[nid]++; | 878 | h->surplus_huge_pages_node[r_nid]++; |
| 839 | __count_vm_event(HTLB_BUDDY_PGALLOC); | 879 | __count_vm_event(HTLB_BUDDY_PGALLOC); |
| 840 | } else { | 880 | } else { |
| 841 | h->nr_huge_pages--; | 881 | h->nr_huge_pages--; |
| @@ -848,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, | |||
| 848 | } | 888 | } |
| 849 | 889 | ||
| 850 | /* | 890 | /* |
| 891 | * This allocation function is useful in contexts where the vma is irrelevant. | ||
| 892 | * E.g. soft-offlining uses this function because it only cares about the | ||
| 893 | * physical address of the error page. | ||
| 894 | */ | ||
| 895 | struct page *alloc_huge_page_node(struct hstate *h, int nid) | ||
| 896 | { | ||
| 897 | struct page *page; | ||
| 898 | |||
| 899 | spin_lock(&hugetlb_lock); | ||
| 900 | page = dequeue_huge_page_node(h, nid); | ||
| 901 | spin_unlock(&hugetlb_lock); | ||
| 902 | |||
| 903 | if (!page) | ||
| 904 | page = alloc_buddy_huge_page(h, nid); | ||
| 905 | |||
| 906 | return page; | ||
| 907 | } | ||
| 908 | |||
| 909 | /* | ||
| 851 | * Increase the hugetlb pool such that it can accomodate a reservation | 910 | * Increase the hugetlb pool such that it can accomodate a reservation |
| 852 | * of size 'delta'. | 911 | * of size 'delta'. |
| 853 | */ | 912 | */ |
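[Review note] A minimal sketch of how the newly exported alloc_huge_page_node() can serve as a migration target allocator for callers that have no vma; it mirrors the new_page() update in the mm/memory-failure.c hunk further down. The function name new_huge_target is illustrative, not part of this patch.

#include <linux/mm.h>
#include <linux/hugetlb.h>

/* Illustrative new_page_t callback for hugepage migration targets. */
static struct page *new_huge_target(struct page *p, unsigned long private,
				    int **result)
{
	/*
	 * Allocate a replacement hugepage of the same size (hstate) on the
	 * node of the source page; no vma or mempolicy is consulted.
	 */
	return alloc_huge_page_node(page_hstate(compound_head(p)),
				    page_to_nid(p));
}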
| @@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta) | |||
| 871 | retry: | 930 | retry: |
| 872 | spin_unlock(&hugetlb_lock); | 931 | spin_unlock(&hugetlb_lock); |
| 873 | for (i = 0; i < needed; i++) { | 932 | for (i = 0; i < needed; i++) { |
| 874 | page = alloc_buddy_huge_page(h, NULL, 0); | 933 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
| 875 | if (!page) { | 934 | if (!page) |
| 876 | /* | 935 | /* |
| 877 | * We were not able to allocate enough pages to | 936 | * We were not able to allocate enough pages to |
| 878 | * satisfy the entire reservation so we free what | 937 | * satisfy the entire reservation so we free what |
| 879 | * we've allocated so far. | 938 | * we've allocated so far. |
| 880 | */ | 939 | */ |
| 881 | spin_lock(&hugetlb_lock); | ||
| 882 | needed = 0; | ||
| 883 | goto free; | 940 | goto free; |
| 884 | } | ||
| 885 | 941 | ||
| 886 | list_add(&page->lru, &surplus_list); | 942 | list_add(&page->lru, &surplus_list); |
| 887 | } | 943 | } |
| @@ -908,31 +964,31 @@ retry: | |||
| 908 | needed += allocated; | 964 | needed += allocated; |
| 909 | h->resv_huge_pages += delta; | 965 | h->resv_huge_pages += delta; |
| 910 | ret = 0; | 966 | ret = 0; |
| 911 | free: | 967 | |
| 968 | spin_unlock(&hugetlb_lock); | ||
| 912 | /* Free the needed pages to the hugetlb pool */ | 969 | /* Free the needed pages to the hugetlb pool */ |
| 913 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 970 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
| 914 | if ((--needed) < 0) | 971 | if ((--needed) < 0) |
| 915 | break; | 972 | break; |
| 916 | list_del(&page->lru); | 973 | list_del(&page->lru); |
| 974 | /* | ||
| 975 | * This page is now managed by the hugetlb allocator and has | ||
| 976 | * no users -- drop the buddy allocator's reference. | ||
| 977 | */ | ||
| 978 | put_page_testzero(page); | ||
| 979 | VM_BUG_ON(page_count(page)); | ||
| 917 | enqueue_huge_page(h, page); | 980 | enqueue_huge_page(h, page); |
| 918 | } | 981 | } |
| 919 | 982 | ||
| 920 | /* Free unnecessary surplus pages to the buddy allocator */ | 983 | /* Free unnecessary surplus pages to the buddy allocator */ |
| 984 | free: | ||
| 921 | if (!list_empty(&surplus_list)) { | 985 | if (!list_empty(&surplus_list)) { |
| 922 | spin_unlock(&hugetlb_lock); | ||
| 923 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 986 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
| 924 | list_del(&page->lru); | 987 | list_del(&page->lru); |
| 925 | /* | 988 | put_page(page); |
| 926 | * The page has a reference count of zero already, so | ||
| 927 | * call free_huge_page directly instead of using | ||
| 928 | * put_page. This must be done with hugetlb_lock | ||
| 929 | * unlocked which is safe because free_huge_page takes | ||
| 930 | * hugetlb_lock before deciding how to free the page. | ||
| 931 | */ | ||
| 932 | free_huge_page(page); | ||
| 933 | } | 989 | } |
| 934 | spin_lock(&hugetlb_lock); | ||
| 935 | } | 990 | } |
| 991 | spin_lock(&hugetlb_lock); | ||
| 936 | 992 | ||
| 937 | return ret; | 993 | return ret; |
| 938 | } | 994 | } |
| @@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
| 1052 | spin_unlock(&hugetlb_lock); | 1108 | spin_unlock(&hugetlb_lock); |
| 1053 | 1109 | ||
| 1054 | if (!page) { | 1110 | if (!page) { |
| 1055 | page = alloc_buddy_huge_page(h, vma, addr); | 1111 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
| 1056 | if (!page) { | 1112 | if (!page) { |
| 1057 | hugetlb_put_quota(inode->i_mapping, chg); | 1113 | hugetlb_put_quota(inode->i_mapping, chg); |
| 1058 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1114 | return ERR_PTR(-VM_FAULT_SIGBUS); |
| 1059 | } | 1115 | } |
| 1060 | } | 1116 | } |
| 1061 | 1117 | ||
| 1062 | set_page_refcounted(page); | ||
| 1063 | set_page_private(page, (unsigned long) mapping); | 1118 | set_page_private(page, (unsigned long) mapping); |
| 1064 | 1119 | ||
| 1065 | vma_commit_reservation(h, vma, addr); | 1120 | vma_commit_reservation(h, vma, addr); |
| @@ -2153,6 +2208,19 @@ nomem: | |||
| 2153 | return -ENOMEM; | 2208 | return -ENOMEM; |
| 2154 | } | 2209 | } |
| 2155 | 2210 | ||
| 2211 | static int is_hugetlb_entry_migration(pte_t pte) | ||
| 2212 | { | ||
| 2213 | swp_entry_t swp; | ||
| 2214 | |||
| 2215 | if (huge_pte_none(pte) || pte_present(pte)) | ||
| 2216 | return 0; | ||
| 2217 | swp = pte_to_swp_entry(pte); | ||
| 2218 | if (non_swap_entry(swp) && is_migration_entry(swp)) { | ||
| 2219 | return 1; | ||
| 2220 | } else | ||
| 2221 | return 0; | ||
| 2222 | } | ||
| 2223 | |||
| 2156 | static int is_hugetlb_entry_hwpoisoned(pte_t pte) | 2224 | static int is_hugetlb_entry_hwpoisoned(pte_t pte) |
| 2157 | { | 2225 | { |
| 2158 | swp_entry_t swp; | 2226 | swp_entry_t swp; |
| @@ -2383,7 +2451,7 @@ retry_avoidcopy: | |||
| 2383 | if (unlikely(anon_vma_prepare(vma))) | 2451 | if (unlikely(anon_vma_prepare(vma))) |
| 2384 | return VM_FAULT_OOM; | 2452 | return VM_FAULT_OOM; |
| 2385 | 2453 | ||
| 2386 | copy_huge_page(new_page, old_page, address, vma); | 2454 | copy_user_huge_page(new_page, old_page, address, vma); |
| 2387 | __SetPageUptodate(new_page); | 2455 | __SetPageUptodate(new_page); |
| 2388 | 2456 | ||
| 2389 | /* | 2457 | /* |
| @@ -2515,22 +2583,20 @@ retry: | |||
| 2515 | hugepage_add_new_anon_rmap(page, vma, address); | 2583 | hugepage_add_new_anon_rmap(page, vma, address); |
| 2516 | } | 2584 | } |
| 2517 | } else { | 2585 | } else { |
| 2586 | /* | ||
| 2587 | * If a memory error occurs between mmap() and fault, some processes | ||
| 2588 | * don't have a hwpoisoned swap entry for the errored virtual address. | ||
| 2589 | * So we need to block the hugepage fault by checking the PG_hwpoison bit. | ||
| 2590 | */ | ||
| 2591 | if (unlikely(PageHWPoison(page))) { | ||
| 2592 | ret = VM_FAULT_HWPOISON | | ||
| 2593 | VM_FAULT_SET_HINDEX(h - hstates); | ||
| 2594 | goto backout_unlocked; | ||
| 2595 | } | ||
| 2518 | page_dup_rmap(page); | 2596 | page_dup_rmap(page); |
| 2519 | } | 2597 | } |
| 2520 | 2598 | ||
| 2521 | /* | 2599 | /* |
| 2522 | * Since memory error handler replaces pte into hwpoison swap entry | ||
| 2523 | * at the time of error handling, a process which reserved but not have | ||
| 2524 | * the mapping to the error hugepage does not have hwpoison swap entry. | ||
| 2525 | * So we need to block accesses from such a process by checking | ||
| 2526 | * PG_hwpoison bit here. | ||
| 2527 | */ | ||
| 2528 | if (unlikely(PageHWPoison(page))) { | ||
| 2529 | ret = VM_FAULT_HWPOISON; | ||
| 2530 | goto backout_unlocked; | ||
| 2531 | } | ||
| 2532 | |||
| 2533 | /* | ||
| 2534 | * If we are going to COW a private mapping later, we examine the | 2600 | * If we are going to COW a private mapping later, we examine the |
| 2535 | * pending reservations for this page now. This will ensure that | 2601 | * pending reservations for this page now. This will ensure that |
| 2536 | * any allocations necessary to record that reservation occur outside | 2602 | * any allocations necessary to record that reservation occur outside |
| @@ -2587,8 +2653,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2587 | ptep = huge_pte_offset(mm, address); | 2653 | ptep = huge_pte_offset(mm, address); |
| 2588 | if (ptep) { | 2654 | if (ptep) { |
| 2589 | entry = huge_ptep_get(ptep); | 2655 | entry = huge_ptep_get(ptep); |
| 2590 | if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | 2656 | if (unlikely(is_hugetlb_entry_migration(entry))) { |
| 2591 | return VM_FAULT_HWPOISON; | 2657 | migration_entry_wait(mm, (pmd_t *)ptep, address); |
| 2658 | return 0; | ||
| 2659 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | ||
| 2660 | return VM_FAULT_HWPOISON_LARGE | | ||
| 2661 | VM_FAULT_SET_HINDEX(h - hstates); | ||
| 2592 | } | 2662 | } |
| 2593 | 2663 | ||
| 2594 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); | 2664 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); |
| @@ -2878,18 +2948,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | |||
| 2878 | hugetlb_acct_memory(h, -(chg - freed)); | 2948 | hugetlb_acct_memory(h, -(chg - freed)); |
| 2879 | } | 2949 | } |
| 2880 | 2950 | ||
| 2951 | #ifdef CONFIG_MEMORY_FAILURE | ||
| 2952 | |||
| 2953 | /* Should be called in hugetlb_lock */ | ||
| 2954 | static int is_hugepage_on_freelist(struct page *hpage) | ||
| 2955 | { | ||
| 2956 | struct page *page; | ||
| 2957 | struct page *tmp; | ||
| 2958 | struct hstate *h = page_hstate(hpage); | ||
| 2959 | int nid = page_to_nid(hpage); | ||
| 2960 | |||
| 2961 | list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) | ||
| 2962 | if (page == hpage) | ||
| 2963 | return 1; | ||
| 2964 | return 0; | ||
| 2965 | } | ||
| 2966 | |||
| 2881 | /* | 2967 | /* |
| 2882 | * This function is called from memory failure code. | 2968 | * This function is called from memory failure code. |
| 2883 | * Assume the caller holds page lock of the head page. | 2969 | * Assume the caller holds page lock of the head page. |
| 2884 | */ | 2970 | */ |
| 2885 | void __isolate_hwpoisoned_huge_page(struct page *hpage) | 2971 | int dequeue_hwpoisoned_huge_page(struct page *hpage) |
| 2886 | { | 2972 | { |
| 2887 | struct hstate *h = page_hstate(hpage); | 2973 | struct hstate *h = page_hstate(hpage); |
| 2888 | int nid = page_to_nid(hpage); | 2974 | int nid = page_to_nid(hpage); |
| 2975 | int ret = -EBUSY; | ||
| 2889 | 2976 | ||
| 2890 | spin_lock(&hugetlb_lock); | 2977 | spin_lock(&hugetlb_lock); |
| 2891 | list_del(&hpage->lru); | 2978 | if (is_hugepage_on_freelist(hpage)) { |
| 2892 | h->free_huge_pages--; | 2979 | list_del(&hpage->lru); |
| 2893 | h->free_huge_pages_node[nid]--; | 2980 | set_page_refcounted(hpage); |
| 2981 | h->free_huge_pages--; | ||
| 2982 | h->free_huge_pages_node[nid]--; | ||
| 2983 | ret = 0; | ||
| 2984 | } | ||
| 2894 | spin_unlock(&hugetlb_lock); | 2985 | spin_unlock(&hugetlb_lock); |
| 2986 | return ret; | ||
| 2895 | } | 2987 | } |
| 2988 | #endif | ||
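[Review note] dequeue_hwpoisoned_huge_page() now reports whether the poisoned hugepage was actually on the free list (0) or is still in use (-EBUSY), and on success it leaves the page with a reference so it can never be reallocated. A minimal sketch of the calling pattern the memory-failure paths below rely on; the wrapper name contain_free_hugepage is illustrative.

/* Illustrative: contain a poisoned hugepage if it sits on the free list. */
static int contain_free_hugepage(struct page *hpage)
{
	/* Caller is assumed to hold the page lock of the head hugepage. */
	int ret = dequeue_hwpoisoned_huge_page(hpage);

	if (!ret)
		return 0;	/* was free; now refcounted and never reused */
	return ret;		/* -EBUSY: in use, fall back to unmap/migrate handling */
}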
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2044fe8920c2..44a8cefeae6e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
| @@ -697,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) | |||
| 697 | * Issues: | 697 | * Issues: |
| 698 | * - Error on hugepage is contained in hugepage unit (not in raw page unit.) | 698 | * - Error on hugepage is contained in hugepage unit (not in raw page unit.) |
| 699 | * To narrow down kill region to one page, we need to break up pmd. | 699 | * To narrow down kill region to one page, we need to break up pmd. |
| 700 | * - To support soft-offlining for hugepage, we need to support hugepage | ||
| 701 | * migration. | ||
| 702 | */ | 700 | */ |
| 703 | static int me_huge_page(struct page *p, unsigned long pfn) | 701 | static int me_huge_page(struct page *p, unsigned long pfn) |
| 704 | { | 702 | { |
| 703 | int res = 0; | ||
| 705 | struct page *hpage = compound_head(p); | 704 | struct page *hpage = compound_head(p); |
| 706 | /* | 705 | /* |
| 707 | * We can safely recover from error on free or reserved (i.e. | 706 | * We can safely recover from error on free or reserved (i.e. |
| @@ -714,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
| 714 | * so there is no race between isolation and mapping/unmapping. | 713 | * so there is no race between isolation and mapping/unmapping. |
| 715 | */ | 714 | */ |
| 716 | if (!(page_mapping(hpage) || PageAnon(hpage))) { | 715 | if (!(page_mapping(hpage) || PageAnon(hpage))) { |
| 717 | __isolate_hwpoisoned_huge_page(hpage); | 716 | res = dequeue_hwpoisoned_huge_page(hpage); |
| 718 | return RECOVERED; | 717 | if (!res) |
| 718 | return RECOVERED; | ||
| 719 | } | 719 | } |
| 720 | return DELAYED; | 720 | return DELAYED; |
| 721 | } | 721 | } |
| @@ -972,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 972 | * We need/can do nothing about count=0 pages. | 972 | * We need/can do nothing about count=0 pages. |
| 973 | * 1) it's a free page, and therefore in safe hand: | 973 | * 1) it's a free page, and therefore in safe hand: |
| 974 | * prep_new_page() will be the gate keeper. | 974 | * prep_new_page() will be the gate keeper. |
| 975 | * 2) it's part of a non-compound high order page. | 975 | * 2) it's a free hugepage, which is also safe: |
| 976 | * an affected hugepage will be dequeued from hugepage freelist, | ||
| 977 | * so there's no concern about reusing it ever after. | ||
| 978 | * 3) it's part of a non-compound high order page. | ||
| 976 | * Implies some kernel user: cannot stop them from | 979 | * Implies some kernel user: cannot stop them from |
| 977 | * R/W the page; let's pray that the page has been | 980 | * R/W the page; let's pray that the page has been |
| 978 | * used and will be freed some time later. | 981 | * used and will be freed some time later. |
| @@ -984,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 984 | if (is_free_buddy_page(p)) { | 987 | if (is_free_buddy_page(p)) { |
| 985 | action_result(pfn, "free buddy", DELAYED); | 988 | action_result(pfn, "free buddy", DELAYED); |
| 986 | return 0; | 989 | return 0; |
| 990 | } else if (PageHuge(hpage)) { | ||
| 991 | /* | ||
| 992 | * Check "just unpoisoned", "filter hit", and | ||
| 993 | * "race with other subpage." | ||
| 994 | */ | ||
| 995 | lock_page_nosync(hpage); | ||
| 996 | if (!PageHWPoison(hpage) | ||
| 997 | || (hwpoison_filter(p) && TestClearPageHWPoison(p)) | ||
| 998 | || (p != hpage && TestSetPageHWPoison(hpage))) { | ||
| 999 | atomic_long_sub(nr_pages, &mce_bad_pages); | ||
| 1000 | return 0; | ||
| 1001 | } | ||
| 1002 | set_page_hwpoison_huge_page(hpage); | ||
| 1003 | res = dequeue_hwpoisoned_huge_page(hpage); | ||
| 1004 | action_result(pfn, "free huge", | ||
| 1005 | res ? IGNORED : DELAYED); | ||
| 1006 | unlock_page(hpage); | ||
| 1007 | return res; | ||
| 987 | } else { | 1008 | } else { |
| 988 | action_result(pfn, "high order kernel", IGNORED); | 1009 | action_result(pfn, "high order kernel", IGNORED); |
| 989 | return -EBUSY; | 1010 | return -EBUSY; |
| @@ -1145,6 +1166,16 @@ int unpoison_memory(unsigned long pfn) | |||
| 1145 | nr_pages = 1 << compound_order(page); | 1166 | nr_pages = 1 << compound_order(page); |
| 1146 | 1167 | ||
| 1147 | if (!get_page_unless_zero(page)) { | 1168 | if (!get_page_unless_zero(page)) { |
| 1169 | /* | ||
| 1170 | * Since HWPoisoned hugepage should have non-zero refcount, | ||
| 1171 | * race between memory failure and unpoison seems to happen. | ||
| 1172 | * In such case unpoison fails and memory failure runs | ||
| 1173 | * to the end. | ||
| 1174 | */ | ||
| 1175 | if (PageHuge(page)) { | ||
| 1176 | pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); | ||
| 1177 | return 0; | ||
| 1178 | } | ||
| 1148 | if (TestClearPageHWPoison(p)) | 1179 | if (TestClearPageHWPoison(p)) |
| 1149 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1180 | atomic_long_sub(nr_pages, &mce_bad_pages); |
| 1150 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); | 1181 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); |
| @@ -1162,9 +1193,9 @@ int unpoison_memory(unsigned long pfn) | |||
| 1162 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); | 1193 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); |
| 1163 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1194 | atomic_long_sub(nr_pages, &mce_bad_pages); |
| 1164 | freeit = 1; | 1195 | freeit = 1; |
| 1196 | if (PageHuge(page)) | ||
| 1197 | clear_page_hwpoison_huge_page(page); | ||
| 1165 | } | 1198 | } |
| 1166 | if (PageHuge(p)) | ||
| 1167 | clear_page_hwpoison_huge_page(page); | ||
| 1168 | unlock_page(page); | 1199 | unlock_page(page); |
| 1169 | 1200 | ||
| 1170 | put_page(page); | 1201 | put_page(page); |
| @@ -1178,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory); | |||
| 1178 | static struct page *new_page(struct page *p, unsigned long private, int **x) | 1209 | static struct page *new_page(struct page *p, unsigned long private, int **x) |
| 1179 | { | 1210 | { |
| 1180 | int nid = page_to_nid(p); | 1211 | int nid = page_to_nid(p); |
| 1181 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | 1212 | if (PageHuge(p)) |
| 1213 | return alloc_huge_page_node(page_hstate(compound_head(p)), | ||
| 1214 | nid); | ||
| 1215 | else | ||
| 1216 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | ||
| 1182 | } | 1217 | } |
| 1183 | 1218 | ||
| 1184 | /* | 1219 | /* |
| @@ -1206,8 +1241,15 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
| 1206 | * was free. | 1241 | * was free. |
| 1207 | */ | 1242 | */ |
| 1208 | set_migratetype_isolate(p); | 1243 | set_migratetype_isolate(p); |
| 1244 | /* | ||
| 1245 | * When the target page is a free hugepage, just remove it | ||
| 1246 | * from free hugepage list. | ||
| 1247 | */ | ||
| 1209 | if (!get_page_unless_zero(compound_head(p))) { | 1248 | if (!get_page_unless_zero(compound_head(p))) { |
| 1210 | if (is_free_buddy_page(p)) { | 1249 | if (PageHuge(p)) { |
| 1250 | pr_info("get_any_page: %#lx free huge page\n", pfn); | ||
| 1251 | ret = dequeue_hwpoisoned_huge_page(compound_head(p)); | ||
| 1252 | } else if (is_free_buddy_page(p)) { | ||
| 1211 | pr_info("get_any_page: %#lx free buddy page\n", pfn); | 1253 | pr_info("get_any_page: %#lx free buddy page\n", pfn); |
| 1212 | /* Set hwpoison bit while page is still isolated */ | 1254 | /* Set hwpoison bit while page is still isolated */ |
| 1213 | SetPageHWPoison(p); | 1255 | SetPageHWPoison(p); |
| @@ -1226,6 +1268,45 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
| 1226 | return ret; | 1268 | return ret; |
| 1227 | } | 1269 | } |
| 1228 | 1270 | ||
| 1271 | static int soft_offline_huge_page(struct page *page, int flags) | ||
| 1272 | { | ||
| 1273 | int ret; | ||
| 1274 | unsigned long pfn = page_to_pfn(page); | ||
| 1275 | struct page *hpage = compound_head(page); | ||
| 1276 | LIST_HEAD(pagelist); | ||
| 1277 | |||
| 1278 | ret = get_any_page(page, pfn, flags); | ||
| 1279 | if (ret < 0) | ||
| 1280 | return ret; | ||
| 1281 | if (ret == 0) | ||
| 1282 | goto done; | ||
| 1283 | |||
| 1284 | if (PageHWPoison(hpage)) { | ||
| 1285 | put_page(hpage); | ||
| 1286 | pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); | ||
| 1287 | return -EBUSY; | ||
| 1288 | } | ||
| 1289 | |||
| 1290 | /* Keep page count to indicate a given hugepage is isolated. */ | ||
| 1291 | |||
| 1292 | list_add(&hpage->lru, &pagelist); | ||
| 1293 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | ||
| 1294 | if (ret) { | ||
| 1295 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | ||
| 1296 | pfn, ret, page->flags); | ||
| 1297 | if (ret > 0) | ||
| 1298 | ret = -EIO; | ||
| 1299 | return ret; | ||
| 1300 | } | ||
| 1301 | done: | ||
| 1302 | if (!PageHWPoison(hpage)) | ||
| 1303 | atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); | ||
| 1304 | set_page_hwpoison_huge_page(hpage); | ||
| 1305 | dequeue_hwpoisoned_huge_page(hpage); | ||
| 1306 | /* keep elevated page count for bad page */ | ||
| 1307 | return ret; | ||
| 1308 | } | ||
| 1309 | |||
| 1229 | /** | 1310 | /** |
| 1230 | * soft_offline_page - Soft offline a page. | 1311 | * soft_offline_page - Soft offline a page. |
| 1231 | * @page: page to offline | 1312 | * @page: page to offline |
| @@ -1253,6 +1334,9 @@ int soft_offline_page(struct page *page, int flags) | |||
| 1253 | int ret; | 1334 | int ret; |
| 1254 | unsigned long pfn = page_to_pfn(page); | 1335 | unsigned long pfn = page_to_pfn(page); |
| 1255 | 1336 | ||
| 1337 | if (PageHuge(page)) | ||
| 1338 | return soft_offline_huge_page(page, flags); | ||
| 1339 | |||
| 1256 | ret = get_any_page(page, pfn, flags); | 1340 | ret = get_any_page(page, pfn, flags); |
| 1257 | if (ret < 0) | 1341 | if (ret < 0) |
| 1258 | return ret; | 1342 | return ret; |
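[Review note] With soft_offline_page() now dispatching to soft_offline_huge_page(), the existing soft-offline entry points can be pointed at hugepages. A minimal userspace sketch, assuming a kernel with CONFIG_MEMORY_FAILURE, the MADV_SOFT_OFFLINE madvise flag, preallocated hugepages, and CAP_SYS_ADMIN; the fallback constant and the 2MB size are assumptions, not taken from this patch.

#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* assumed value; check your kernel headers */
#endif

int main(void)
{
	size_t len = 2UL * 1024 * 1024;		/* assume one 2MB hugepage */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");	/* needs reserved hugepages */
		return 1;
	}
	*(volatile char *)p = 1;		/* fault the hugepage in */

	/* Ask the kernel to migrate the data away and retire the hugepage. */
	if (madvise(p, len, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");	/* needs CAP_SYS_ADMIN */
	return 0;
}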
diff --git a/mm/memory.c b/mm/memory.c
index 98b58fecedef..af82741caaa4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
| @@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1450 | if (ret & VM_FAULT_OOM) | 1450 | if (ret & VM_FAULT_OOM) |
| 1451 | return i ? i : -ENOMEM; | 1451 | return i ? i : -ENOMEM; |
| 1452 | if (ret & | 1452 | if (ret & |
| 1453 | (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) | 1453 | (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| |
| 1454 | VM_FAULT_SIGBUS)) | ||
| 1454 | return i ? i : -EFAULT; | 1455 | return i ? i : -EFAULT; |
| 1455 | BUG(); | 1456 | BUG(); |
| 1456 | } | 1457 | } |
diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f4..f8c9bccf2520 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
| @@ -32,6 +32,7 @@ | |||
| 32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
| 33 | #include <linux/memcontrol.h> | 33 | #include <linux/memcontrol.h> |
| 34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
| 35 | #include <linux/hugetlb.h> | ||
| 35 | #include <linux/gfp.h> | 36 | #include <linux/gfp.h> |
| 36 | 37 | ||
| 37 | #include "internal.h" | 38 | #include "internal.h" |
| @@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
| 95 | pte_t *ptep, pte; | 96 | pte_t *ptep, pte; |
| 96 | spinlock_t *ptl; | 97 | spinlock_t *ptl; |
| 97 | 98 | ||
| 98 | pgd = pgd_offset(mm, addr); | 99 | if (unlikely(PageHuge(new))) { |
| 99 | if (!pgd_present(*pgd)) | 100 | ptep = huge_pte_offset(mm, addr); |
| 100 | goto out; | 101 | if (!ptep) |
| 102 | goto out; | ||
| 103 | ptl = &mm->page_table_lock; | ||
| 104 | } else { | ||
| 105 | pgd = pgd_offset(mm, addr); | ||
| 106 | if (!pgd_present(*pgd)) | ||
| 107 | goto out; | ||
| 101 | 108 | ||
| 102 | pud = pud_offset(pgd, addr); | 109 | pud = pud_offset(pgd, addr); |
| 103 | if (!pud_present(*pud)) | 110 | if (!pud_present(*pud)) |
| 104 | goto out; | 111 | goto out; |
| 105 | 112 | ||
| 106 | pmd = pmd_offset(pud, addr); | 113 | pmd = pmd_offset(pud, addr); |
| 107 | if (!pmd_present(*pmd)) | 114 | if (!pmd_present(*pmd)) |
| 108 | goto out; | 115 | goto out; |
| 109 | 116 | ||
| 110 | ptep = pte_offset_map(pmd, addr); | 117 | ptep = pte_offset_map(pmd, addr); |
| 111 | 118 | ||
| 112 | if (!is_swap_pte(*ptep)) { | 119 | if (!is_swap_pte(*ptep)) { |
| 113 | pte_unmap(ptep); | 120 | pte_unmap(ptep); |
| 114 | goto out; | 121 | goto out; |
| 115 | } | 122 | } |
| 123 | |||
| 124 | ptl = pte_lockptr(mm, pmd); | ||
| 125 | } | ||
| 116 | 126 | ||
| 117 | ptl = pte_lockptr(mm, pmd); | ||
| 118 | spin_lock(ptl); | 127 | spin_lock(ptl); |
| 119 | pte = *ptep; | 128 | pte = *ptep; |
| 120 | if (!is_swap_pte(pte)) | 129 | if (!is_swap_pte(pte)) |
| @@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
| 130 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 139 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
| 131 | if (is_write_migration_entry(entry)) | 140 | if (is_write_migration_entry(entry)) |
| 132 | pte = pte_mkwrite(pte); | 141 | pte = pte_mkwrite(pte); |
| 142 | #ifdef CONFIG_HUGETLB_PAGE | ||
| 143 | if (PageHuge(new)) | ||
| 144 | pte = pte_mkhuge(pte); | ||
| 145 | #endif | ||
| 133 | flush_cache_page(vma, addr, pte_pfn(pte)); | 146 | flush_cache_page(vma, addr, pte_pfn(pte)); |
| 134 | set_pte_at(mm, addr, ptep, pte); | 147 | set_pte_at(mm, addr, ptep, pte); |
| 135 | 148 | ||
| 136 | if (PageAnon(new)) | 149 | if (PageHuge(new)) { |
| 150 | if (PageAnon(new)) | ||
| 151 | hugepage_add_anon_rmap(new, vma, addr); | ||
| 152 | else | ||
| 153 | page_dup_rmap(new); | ||
| 154 | } else if (PageAnon(new)) | ||
| 137 | page_add_anon_rmap(new, vma, addr); | 155 | page_add_anon_rmap(new, vma, addr); |
| 138 | else | 156 | else |
| 139 | page_add_file_rmap(new); | 157 | page_add_file_rmap(new); |
| @@ -276,11 +294,59 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
| 276 | } | 294 | } |
| 277 | 295 | ||
| 278 | /* | 296 | /* |
| 297 | * The expected number of remaining references is the same as that | ||
| 298 | * of migrate_page_move_mapping(). | ||
| 299 | */ | ||
| 300 | int migrate_huge_page_move_mapping(struct address_space *mapping, | ||
| 301 | struct page *newpage, struct page *page) | ||
| 302 | { | ||
| 303 | int expected_count; | ||
| 304 | void **pslot; | ||
| 305 | |||
| 306 | if (!mapping) { | ||
| 307 | if (page_count(page) != 1) | ||
| 308 | return -EAGAIN; | ||
| 309 | return 0; | ||
| 310 | } | ||
| 311 | |||
| 312 | spin_lock_irq(&mapping->tree_lock); | ||
| 313 | |||
| 314 | pslot = radix_tree_lookup_slot(&mapping->page_tree, | ||
| 315 | page_index(page)); | ||
| 316 | |||
| 317 | expected_count = 2 + page_has_private(page); | ||
| 318 | if (page_count(page) != expected_count || | ||
| 319 | (struct page *)radix_tree_deref_slot(pslot) != page) { | ||
| 320 | spin_unlock_irq(&mapping->tree_lock); | ||
| 321 | return -EAGAIN; | ||
| 322 | } | ||
| 323 | |||
| 324 | if (!page_freeze_refs(page, expected_count)) { | ||
| 325 | spin_unlock_irq(&mapping->tree_lock); | ||
| 326 | return -EAGAIN; | ||
| 327 | } | ||
| 328 | |||
| 329 | get_page(newpage); | ||
| 330 | |||
| 331 | radix_tree_replace_slot(pslot, newpage); | ||
| 332 | |||
| 333 | page_unfreeze_refs(page, expected_count); | ||
| 334 | |||
| 335 | __put_page(page); | ||
| 336 | |||
| 337 | spin_unlock_irq(&mapping->tree_lock); | ||
| 338 | return 0; | ||
| 339 | } | ||
| 340 | |||
| 341 | /* | ||
| 279 | * Copy the page to its new location | 342 | * Copy the page to its new location |
| 280 | */ | 343 | */ |
| 281 | static void migrate_page_copy(struct page *newpage, struct page *page) | 344 | void migrate_page_copy(struct page *newpage, struct page *page) |
| 282 | { | 345 | { |
| 283 | copy_highpage(newpage, page); | 346 | if (PageHuge(page)) |
| 347 | copy_huge_page(newpage, page); | ||
| 348 | else | ||
| 349 | copy_highpage(newpage, page); | ||
| 284 | 350 | ||
| 285 | if (PageError(page)) | 351 | if (PageError(page)) |
| 286 | SetPageError(newpage); | 352 | SetPageError(newpage); |
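[Review note] A worked example of the reference arithmetic that the new migrate_huge_page_move_mapping() above depends on, assuming a hugetlbfs pagecache page with no private data (illustrative numbers):

/*
 * expected_count = 2 + page_has_private(page)
 *                = 2 + 0      one reference held by the radix tree slot,
 *                             one held by the migration caller
 *
 * page_freeze_refs(page, 2) then succeeds only when nobody else holds a
 * reference -- e.g. direct I/O in flight on any subpage pins the head page,
 * so such a hugepage fails migration with -EAGAIN instead of racing.
 */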
| @@ -724,6 +790,92 @@ move_newpage: | |||
| 724 | } | 790 | } |
| 725 | 791 | ||
| 726 | /* | 792 | /* |
| 793 | * Counterpart of unmap_and_move_page() for hugepage migration. | ||
| 794 | * | ||
| 795 | * This function doesn't wait for the completion of hugepage I/O | ||
| 796 | * because there is no race between I/O and migration for hugepages. | ||
| 797 | * Note that currently hugepage I/O occurs only in direct I/O | ||
| 798 | * where no lock is held and PG_writeback is irrelevant, | ||
| 799 | * and the writeback status of all subpages is counted in the reference | ||
| 800 | * count of the head page (i.e. if all subpages of a 2MB hugepage are | ||
| 801 | * under direct I/O, the reference of the head page is 512 and a bit more.) | ||
| 802 | * This means that when we try to migrate hugepage whose subpages are | ||
| 803 | * doing direct I/O, some references remain after try_to_unmap() and | ||
| 804 | * hugepage migration fails without data corruption. | ||
| 805 | * | ||
| 806 | * There is also no race when direct I/O is issued on the page under migration, | ||
| 807 | * because then pte is replaced with migration swap entry and direct I/O code | ||
| 808 | * will wait in the page fault for migration to complete. | ||
| 809 | */ | ||
| 810 | static int unmap_and_move_huge_page(new_page_t get_new_page, | ||
| 811 | unsigned long private, struct page *hpage, | ||
| 812 | int force, int offlining) | ||
| 813 | { | ||
| 814 | int rc = 0; | ||
| 815 | int *result = NULL; | ||
| 816 | struct page *new_hpage = get_new_page(hpage, private, &result); | ||
| 817 | int rcu_locked = 0; | ||
| 818 | struct anon_vma *anon_vma = NULL; | ||
| 819 | |||
| 820 | if (!new_hpage) | ||
| 821 | return -ENOMEM; | ||
| 822 | |||
| 823 | rc = -EAGAIN; | ||
| 824 | |||
| 825 | if (!trylock_page(hpage)) { | ||
| 826 | if (!force) | ||
| 827 | goto out; | ||
| 828 | lock_page(hpage); | ||
| 829 | } | ||
| 830 | |||
| 831 | if (PageAnon(hpage)) { | ||
| 832 | rcu_read_lock(); | ||
| 833 | rcu_locked = 1; | ||
| 834 | |||
| 835 | if (page_mapped(hpage)) { | ||
| 836 | anon_vma = page_anon_vma(hpage); | ||
| 837 | atomic_inc(&anon_vma->external_refcount); | ||
| 838 | } | ||
| 839 | } | ||
| 840 | |||
| 841 | try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | ||
| 842 | |||
| 843 | if (!page_mapped(hpage)) | ||
| 844 | rc = move_to_new_page(new_hpage, hpage, 1); | ||
| 845 | |||
| 846 | if (rc) | ||
| 847 | remove_migration_ptes(hpage, hpage); | ||
| 848 | |||
| 849 | if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, | ||
| 850 | &anon_vma->lock)) { | ||
| 851 | int empty = list_empty(&anon_vma->head); | ||
| 852 | spin_unlock(&anon_vma->lock); | ||
| 853 | if (empty) | ||
| 854 | anon_vma_free(anon_vma); | ||
| 855 | } | ||
| 856 | |||
| 857 | if (rcu_locked) | ||
| 858 | rcu_read_unlock(); | ||
| 859 | out: | ||
| 860 | unlock_page(hpage); | ||
| 861 | |||
| 862 | if (rc != -EAGAIN) { | ||
| 863 | list_del(&hpage->lru); | ||
| 864 | put_page(hpage); | ||
| 865 | } | ||
| 866 | |||
| 867 | put_page(new_hpage); | ||
| 868 | |||
| 869 | if (result) { | ||
| 870 | if (rc) | ||
| 871 | *result = rc; | ||
| 872 | else | ||
| 873 | *result = page_to_nid(new_hpage); | ||
| 874 | } | ||
| 875 | return rc; | ||
| 876 | } | ||
| 877 | |||
| 878 | /* | ||
| 727 | * migrate_pages | 879 | * migrate_pages |
| 728 | * | 880 | * |
| 729 | * The function takes one list of pages to migrate and a function | 881 | * The function takes one list of pages to migrate and a function |
| @@ -788,6 +940,52 @@ out: | |||
| 788 | return nr_failed + retry; | 940 | return nr_failed + retry; |
| 789 | } | 941 | } |
| 790 | 942 | ||
| 943 | int migrate_huge_pages(struct list_head *from, | ||
| 944 | new_page_t get_new_page, unsigned long private, int offlining) | ||
| 945 | { | ||
| 946 | int retry = 1; | ||
| 947 | int nr_failed = 0; | ||
| 948 | int pass = 0; | ||
| 949 | struct page *page; | ||
| 950 | struct page *page2; | ||
| 951 | int rc; | ||
| 952 | |||
| 953 | for (pass = 0; pass < 10 && retry; pass++) { | ||
| 954 | retry = 0; | ||
| 955 | |||
| 956 | list_for_each_entry_safe(page, page2, from, lru) { | ||
| 957 | cond_resched(); | ||
| 958 | |||
| 959 | rc = unmap_and_move_huge_page(get_new_page, | ||
| 960 | private, page, pass > 2, offlining); | ||
| 961 | |||
| 962 | switch(rc) { | ||
| 963 | case -ENOMEM: | ||
| 964 | goto out; | ||
| 965 | case -EAGAIN: | ||
| 966 | retry++; | ||
| 967 | break; | ||
| 968 | case 0: | ||
| 969 | break; | ||
| 970 | default: | ||
| 971 | /* Permanent failure */ | ||
| 972 | nr_failed++; | ||
| 973 | break; | ||
| 974 | } | ||
| 975 | } | ||
| 976 | } | ||
| 977 | rc = 0; | ||
| 978 | out: | ||
| 979 | |||
| 980 | list_for_each_entry_safe(page, page2, from, lru) | ||
| 981 | put_page(page); | ||
| 982 | |||
| 983 | if (rc) | ||
| 984 | return rc; | ||
| 985 | |||
| 986 | return nr_failed + retry; | ||
| 987 | } | ||
| 988 | |||
| 791 | #ifdef CONFIG_NUMA | 989 | #ifdef CONFIG_NUMA |
| 792 | /* | 990 | /* |
| 793 | * Move a list of individual pages | 991 | * Move a list of individual pages |
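[Review note] A minimal sketch of how the new migrate_huge_pages() entry point is driven, modeled on soft_offline_huge_page() in the mm/memory-failure.c hunk above. The wrapper name migrate_one_hugepage and the reuse of the earlier illustrative new_huge_target() callback are assumptions, not part of this patch.

#include <linux/list.h>
#include <linux/migrate.h>
#include <linux/mm.h>

/* Illustrative: migrate one isolated hugepage to a freshly allocated target. */
static int migrate_one_hugepage(struct page *hpage)
{
	LIST_HEAD(pagelist);
	int ret;

	/* The caller is assumed to hold a reference that keeps hpage isolated. */
	list_add(&hpage->lru, &pagelist);
	ret = migrate_huge_pages(&pagelist, new_huge_target, 0, 0);
	if (ret > 0)		/* count of hugepages that could not be migrated */
		ret = -EIO;	/* migrate_huge_pages() drops the list references */
	return ret;
}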
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
| @@ -780,10 +780,10 @@ void page_move_anon_rmap(struct page *page, |||
| 780 | } | 780 | } |
| 781 | 781 | ||
| 782 | /** | 782 | /** |
| 783 | * __page_set_anon_rmap - setup new anonymous rmap | 783 | * __page_set_anon_rmap - set up new anonymous rmap |
| 784 | * @page: the page to add the mapping to | 784 | * @page: Page to add to rmap |
| 785 | * @vma: the vm area in which the mapping is added | 785 | * @vma: VM area to add page to. |
| 786 | * @address: the user virtual address mapped | 786 | * @address: User virtual address of the mapping |
| 787 | * @exclusive: the page is exclusively owned by the current process | 787 | * @exclusive: the page is exclusively owned by the current process |
| 788 | */ | 788 | */ |
| 789 | static void __page_set_anon_rmap(struct page *page, | 789 | static void __page_set_anon_rmap(struct page *page, |
| @@ -793,25 +793,16 @@ static void __page_set_anon_rmap(struct page *page, | |||
| 793 | 793 | ||
| 794 | BUG_ON(!anon_vma); | 794 | BUG_ON(!anon_vma); |
| 795 | 795 | ||
| 796 | if (PageAnon(page)) | ||
| 797 | return; | ||
| 798 | |||
| 796 | /* | 799 | /* |
| 797 | * If the page isn't exclusively mapped into this vma, | 800 | * If the page isn't exclusively mapped into this vma, |
| 798 | * we must use the _oldest_ possible anon_vma for the | 801 | * we must use the _oldest_ possible anon_vma for the |
| 799 | * page mapping! | 802 | * page mapping! |
| 800 | */ | 803 | */ |
| 801 | if (!exclusive) { | 804 | if (!exclusive) |
| 802 | if (PageAnon(page)) | ||
| 803 | return; | ||
| 804 | anon_vma = anon_vma->root; | 805 | anon_vma = anon_vma->root; |
| 805 | } else { | ||
| 806 | /* | ||
| 807 | * In this case, swapped-out-but-not-discarded swap-cache | ||
| 808 | * is remapped. So, no need to update page->mapping here. | ||
| 809 | * We convice anon_vma poitned by page->mapping is not obsolete | ||
| 810 | * because vma->anon_vma is necessary to be a family of it. | ||
| 811 | */ | ||
| 812 | if (PageAnon(page)) | ||
| 813 | return; | ||
| 814 | } | ||
| 815 | 806 | ||
| 816 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 807 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
| 817 | page->mapping = (struct address_space *) anon_vma; | 808 | page->mapping = (struct address_space *) anon_vma; |
