Diffstat (limited to 'mm/hugetlb.c')

 mm/hugetlb.c | 233 ++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 163 insertions(+), 70 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182..96991ded82fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page,
 	}
 }
 
-static void copy_gigantic_page(struct page *dst, struct page *src,
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
 			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 	struct hstate *h = hstate_vma(vma);
 	struct page *dst_base = dst;
 	struct page *src_base = src;
-	might_sleep();
+
 	for (i = 0; i < pages_per_huge_page(h); ) {
 		cond_resched();
 		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
@@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src,
 		src = mem_map_next(src, src_base, i);
 	}
 }
-static void copy_huge_page(struct page *dst, struct page *src,
+
+static void copy_user_huge_page(struct page *dst, struct page *src,
 			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 	struct hstate *h = hstate_vma(vma);
 
 	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
-		copy_gigantic_page(dst, src, addr, vma);
+		copy_user_gigantic_page(dst, src, addr, vma);
 		return;
 	}
 
@@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src,
 	}
 }
 
+static void copy_gigantic_page(struct page *dst, struct page *src)
+{
+	int i;
+	struct hstate *h = page_hstate(src);
+	struct page *dst_base = dst;
+	struct page *src_base = src;
+
+	for (i = 0; i < pages_per_huge_page(h); ) {
+		cond_resched();
+		copy_highpage(dst, src);
+
+		i++;
+		dst = mem_map_next(dst, dst_base, i);
+		src = mem_map_next(src, src_base, i);
+	}
+}
+
+void copy_huge_page(struct page *dst, struct page *src)
+{
+	int i;
+	struct hstate *h = page_hstate(src);
+
+	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+		copy_gigantic_page(dst, src);
+		return;
+	}
+
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page(h); i++) {
+		cond_resched();
+		copy_highpage(dst + i, src + i);
+	}
+}
+
 static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
 	int nid = page_to_nid(page);
@@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
 	h->free_huge_pages_node[nid]++;
 }
 
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+{
+	struct page *page;
+
+	if (list_empty(&h->hugepage_freelists[nid]))
+		return NULL;
+	page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
+	list_del(&page->lru);
+	set_page_refcounted(page);
+	h->free_huge_pages--;
+	h->free_huge_pages_node[nid]--;
+	return page;
+}
+
 static struct page *dequeue_huge_page_vma(struct hstate *h,
 				struct vm_area_struct *vma,
 				unsigned long address, int avoid_reserve)
 {
-	int nid;
 	struct page *page = NULL;
 	struct mempolicy *mpol;
 	nodemask_t *nodemask;
@@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						MAX_NR_ZONES - 1, nodemask) {
-		nid = zone_to_nid(zone);
-		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
-		    !list_empty(&h->hugepage_freelists[nid])) {
-			page = list_entry(h->hugepage_freelists[nid].next,
-					  struct page, lru);
-			list_del(&page->lru);
-			h->free_huge_pages--;
-			h->free_huge_pages_node[nid]--;
-
-			if (!avoid_reserve)
-				decrement_hugepage_resv_vma(h, vma);
-
-			break;
+		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
+			page = dequeue_huge_page_node(h, zone_to_nid(zone));
+			if (page) {
+				if (!avoid_reserve)
+					decrement_hugepage_resv_vma(h, vma);
+				break;
+			}
 		}
 	}
 err:
@@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
 	return ret;
 }
 
-static struct page *alloc_buddy_huge_page(struct hstate *h,
-			struct vm_area_struct *vma, unsigned long address)
+static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 {
 	struct page *page;
-	unsigned int nid;
+	unsigned int r_nid;
 
 	if (h->order >= MAX_ORDER)
 		return NULL;
@@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 	}
 	spin_unlock(&hugetlb_lock);
 
-	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
-					__GFP_REPEAT|__GFP_NOWARN,
-					huge_page_order(h));
+	if (nid == NUMA_NO_NODE)
+		page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+				   __GFP_REPEAT|__GFP_NOWARN,
+				   huge_page_order(h));
+	else
+		page = alloc_pages_exact_node(nid,
+			htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+			__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
 
 	if (page && arch_prepare_hugepage(page)) {
 		__free_pages(page, huge_page_order(h));
@@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 
 	spin_lock(&hugetlb_lock);
 	if (page) {
-		/*
-		 * This page is now managed by the hugetlb allocator and has
-		 * no users -- drop the buddy allocator's reference.
-		 */
-		put_page_testzero(page);
-		VM_BUG_ON(page_count(page));
-		nid = page_to_nid(page);
+		r_nid = page_to_nid(page);
 		set_compound_page_dtor(page, free_huge_page);
 		/*
 		 * We incremented the global counters already
 		 */
-		h->nr_huge_pages_node[nid]++;
-		h->surplus_huge_pages_node[nid]++;
+		h->nr_huge_pages_node[r_nid]++;
+		h->surplus_huge_pages_node[r_nid]++;
 		__count_vm_event(HTLB_BUDDY_PGALLOC);
 	} else {
 		h->nr_huge_pages--;
@@ -848,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 }
 
 /*
+ * This allocation function is useful in the context where vma is irrelevant.
+ * E.g. soft-offlining uses this function because it only cares physical
+ * address of error page.
+ */
+struct page *alloc_huge_page_node(struct hstate *h, int nid)
+{
+	struct page *page;
+
+	spin_lock(&hugetlb_lock);
+	page = dequeue_huge_page_node(h, nid);
+	spin_unlock(&hugetlb_lock);
+
+	if (!page)
+		page = alloc_buddy_huge_page(h, nid);
+
+	return page;
+}
+
+/*
  * Increase the hugetlb pool such that it can accomodate a reservation
  * of size 'delta'.
  */
@@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
 	spin_unlock(&hugetlb_lock);
 	for (i = 0; i < needed; i++) {
-		page = alloc_buddy_huge_page(h, NULL, 0);
-		if (!page) {
+		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+		if (!page)
 			/*
 			 * We were not able to allocate enough pages to
 			 * satisfy the entire reservation so we free what
 			 * we've allocated so far.
 			 */
-			spin_lock(&hugetlb_lock);
-			needed = 0;
 			goto free;
-		}
 
 		list_add(&page->lru, &surplus_list);
 	}
@@ -908,31 +964,31 @@ retry:
 	needed += allocated;
 	h->resv_huge_pages += delta;
 	ret = 0;
-free:
+
+	spin_unlock(&hugetlb_lock);
 	/* Free the needed pages to the hugetlb pool */
 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 		if ((--needed) < 0)
 			break;
 		list_del(&page->lru);
+		/*
+		 * This page is now managed by the hugetlb allocator and has
+		 * no users -- drop the buddy allocator's reference.
+		 */
+		put_page_testzero(page);
+		VM_BUG_ON(page_count(page));
 		enqueue_huge_page(h, page);
 	}
 
 	/* Free unnecessary surplus pages to the buddy allocator */
+free:
 	if (!list_empty(&surplus_list)) {
-		spin_unlock(&hugetlb_lock);
 		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 			list_del(&page->lru);
-			/*
-			 * The page has a reference count of zero already, so
-			 * call free_huge_page directly instead of using
-			 * put_page. This must be done with hugetlb_lock
-			 * unlocked which is safe because free_huge_page takes
-			 * hugetlb_lock before deciding how to free the page.
-			 */
-			free_huge_page(page);
+			put_page(page);
 		}
-		spin_lock(&hugetlb_lock);
 	}
+	spin_lock(&hugetlb_lock);
 
 	return ret;
 }
@@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	spin_unlock(&hugetlb_lock);
 
 	if (!page) {
-		page = alloc_buddy_huge_page(h, vma, addr);
+		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
 			hugetlb_put_quota(inode->i_mapping, chg);
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 		}
 	}
 
-	set_page_refcounted(page);
 	set_page_private(page, (unsigned long) mapping);
 
 	vma_commit_reservation(h, vma, addr);
@@ -2153,6 +2208,19 @@ nomem:
 	return -ENOMEM;
 }
 
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+	swp_entry_t swp;
+
+	if (huge_pte_none(pte) || pte_present(pte))
+		return 0;
+	swp = pte_to_swp_entry(pte);
+	if (non_swap_entry(swp) && is_migration_entry(swp)) {
+		return 1;
+	} else
+		return 0;
+}
+
 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 {
 	swp_entry_t swp;
@@ -2383,7 +2451,7 @@ retry_avoidcopy:
 	if (unlikely(anon_vma_prepare(vma)))
 		return VM_FAULT_OOM;
 
-	copy_huge_page(new_page, old_page, address, vma);
+	copy_user_huge_page(new_page, old_page, address, vma);
 	__SetPageUptodate(new_page);
 
 	/*
@@ -2515,22 +2583,20 @@ retry:
 			hugepage_add_new_anon_rmap(page, vma, address);
 		}
 	} else {
+		/*
+		 * If memory error occurs between mmap() and fault, some process
+		 * don't have hwpoisoned swap entry for errored virtual address.
+		 * So we need to block hugepage fault by PG_hwpoison bit check.
+		 */
+		if (unlikely(PageHWPoison(page))) {
+			ret = VM_FAULT_HWPOISON |
+				VM_FAULT_SET_HINDEX(h - hstates);
+			goto backout_unlocked;
+		}
 		page_dup_rmap(page);
 	}
 
 	/*
-	 * Since memory error handler replaces pte into hwpoison swap entry
-	 * at the time of error handling, a process which reserved but not have
-	 * the mapping to the error hugepage does not have hwpoison swap entry.
-	 * So we need to block accesses from such a process by checking
-	 * PG_hwpoison bit here.
-	 */
-	if (unlikely(PageHWPoison(page))) {
-		ret = VM_FAULT_HWPOISON;
-		goto backout_unlocked;
-	}
-
-	/*
 	 * If we are going to COW a private mapping later, we examine the
 	 * pending reservations for this page now. This will ensure that
 	 * any allocations necessary to record that reservation occur outside
@@ -2587,8 +2653,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	ptep = huge_pte_offset(mm, address);
 	if (ptep) {
 		entry = huge_ptep_get(ptep);
-		if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
-			return VM_FAULT_HWPOISON;
+		if (unlikely(is_hugetlb_entry_migration(entry))) {
+			migration_entry_wait(mm, (pmd_t *)ptep, address);
+			return 0;
+		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+			return VM_FAULT_HWPOISON_LARGE |
+				VM_FAULT_SET_HINDEX(h - hstates);
 	}
 
 	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2878,18 +2948,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	hugetlb_acct_memory(h, -(chg - freed));
 }
 
+#ifdef CONFIG_MEMORY_FAILURE
+
+/* Should be called in hugetlb_lock */
+static int is_hugepage_on_freelist(struct page *hpage)
+{
+	struct page *page;
+	struct page *tmp;
+	struct hstate *h = page_hstate(hpage);
+	int nid = page_to_nid(hpage);
+
+	list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
+		if (page == hpage)
+			return 1;
+	return 0;
+}
+
 /*
  * This function is called from memory failure code.
  * Assume the caller holds page lock of the head page.
  */
-void __isolate_hwpoisoned_huge_page(struct page *hpage)
+int dequeue_hwpoisoned_huge_page(struct page *hpage)
 {
 	struct hstate *h = page_hstate(hpage);
 	int nid = page_to_nid(hpage);
+	int ret = -EBUSY;
 
 	spin_lock(&hugetlb_lock);
-	list_del(&hpage->lru);
-	h->free_huge_pages--;
-	h->free_huge_pages_node[nid]--;
+	if (is_hugepage_on_freelist(hpage)) {
+		list_del(&hpage->lru);
+		set_page_refcounted(hpage);
+		h->free_huge_pages--;
+		h->free_huge_pages_node[nid]--;
+		ret = 0;
+	}
+	spin_unlock(&hugetlb_lock);
+	return ret;
 }
+#endif
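
For context, a minimal sketch of how a caller outside the page-fault path might use the interfaces added above. Only alloc_huge_page_node() and copy_huge_page() are taken from this diff; the helper name and the surrounding flow are illustrative assumptions, not part of the commit.

/*
 * Illustrative sketch only (assumed helper, not in this diff): allocate a
 * destination hugepage on the same node as a source hugepage, then copy its
 * contents with the new vma-independent copy_huge_page().
 */
#include <linux/hugetlb.h>
#include <linux/mm.h>

static struct page *copy_hugepage_on_node(struct page *src_hpage)
{
	struct hstate *h = page_hstate(src_hpage);
	int nid = page_to_nid(src_hpage);
	struct page *dst_hpage;

	/* Try the per-node free list first, falling back to the buddy allocator. */
	dst_hpage = alloc_huge_page_node(h, nid);
	if (!dst_hpage)
		return NULL;

	/* No vma is required: copy_huge_page() copies the pages directly. */
	copy_huge_page(dst_hpage, src_hpage);
	return dst_hpage;
}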