Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--  mm/hugetlb.c  233
1 file changed, 163 insertions, 70 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182..96991ded82fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page,
 	}
 }
 
-static void copy_gigantic_page(struct page *dst, struct page *src,
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
 			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 	struct hstate *h = hstate_vma(vma);
 	struct page *dst_base = dst;
 	struct page *src_base = src;
-	might_sleep();
+
 	for (i = 0; i < pages_per_huge_page(h); ) {
 		cond_resched();
 		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
@@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src,
 		src = mem_map_next(src, src_base, i);
 	}
 }
-static void copy_huge_page(struct page *dst, struct page *src,
+
+static void copy_user_huge_page(struct page *dst, struct page *src,
 			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 	struct hstate *h = hstate_vma(vma);
 
 	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
-		copy_gigantic_page(dst, src, addr, vma);
+		copy_user_gigantic_page(dst, src, addr, vma);
 		return;
 	}
 
@@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src,
 	}
 }
 
+static void copy_gigantic_page(struct page *dst, struct page *src)
+{
+	int i;
+	struct hstate *h = page_hstate(src);
+	struct page *dst_base = dst;
+	struct page *src_base = src;
+
+	for (i = 0; i < pages_per_huge_page(h); ) {
+		cond_resched();
+		copy_highpage(dst, src);
+
+		i++;
+		dst = mem_map_next(dst, dst_base, i);
+		src = mem_map_next(src, src_base, i);
+	}
+}
+
+void copy_huge_page(struct page *dst, struct page *src)
+{
+	int i;
+	struct hstate *h = page_hstate(src);
+
+	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+		copy_gigantic_page(dst, src);
+		return;
+	}
+
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page(h); i++) {
+		cond_resched();
+		copy_highpage(dst + i, src + i);
+	}
+}
+
 static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
 	int nid = page_to_nid(page);
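
Note: the new copy_huge_page(dst, src) above takes no vma, so it can be called from code that only moves data between physical hugepages, such as hugepage migration. A minimal sketch of such a caller, assuming kernel context and a PageHuge() source page (the wrapper name is illustrative, not part of this patch):

	#include <linux/mm.h>
	#include <linux/highmem.h>
	#include <linux/hugetlb.h>

	/* Illustrative wrapper: pick the right data-copy routine for migration. */
	static void migrate_copy_data(struct page *newpage, struct page *page)
	{
		if (PageHuge(page))
			/* handles gigantic pages internally via copy_gigantic_page() */
			copy_huge_page(newpage, page);
		else
			copy_highpage(newpage, page);
	}
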
@@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
 	h->free_huge_pages_node[nid]++;
 }
 
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+{
+	struct page *page;
+
+	if (list_empty(&h->hugepage_freelists[nid]))
+		return NULL;
+	page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
+	list_del(&page->lru);
+	set_page_refcounted(page);
+	h->free_huge_pages--;
+	h->free_huge_pages_node[nid]--;
+	return page;
+}
+
 static struct page *dequeue_huge_page_vma(struct hstate *h,
 				struct vm_area_struct *vma,
 				unsigned long address, int avoid_reserve)
 {
-	int nid;
 	struct page *page = NULL;
 	struct mempolicy *mpol;
 	nodemask_t *nodemask;
@@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						MAX_NR_ZONES - 1, nodemask) {
-		nid = zone_to_nid(zone);
-		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
-		    !list_empty(&h->hugepage_freelists[nid])) {
-			page = list_entry(h->hugepage_freelists[nid].next,
-					  struct page, lru);
-			list_del(&page->lru);
-			h->free_huge_pages--;
-			h->free_huge_pages_node[nid]--;
-
-			if (!avoid_reserve)
-				decrement_hugepage_resv_vma(h, vma);
-
-			break;
+		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
+			page = dequeue_huge_page_node(h, zone_to_nid(zone));
+			if (page) {
+				if (!avoid_reserve)
+					decrement_hugepage_resv_vma(h, vma);
+				break;
+			}
 		}
 	}
 err:
@@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
 	return ret;
 }
 
-static struct page *alloc_buddy_huge_page(struct hstate *h,
-			struct vm_area_struct *vma, unsigned long address)
+static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 {
 	struct page *page;
-	unsigned int nid;
+	unsigned int r_nid;
 
 	if (h->order >= MAX_ORDER)
 		return NULL;
@@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 	}
 	spin_unlock(&hugetlb_lock);
 
-	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
-					__GFP_REPEAT|__GFP_NOWARN,
-					huge_page_order(h));
+	if (nid == NUMA_NO_NODE)
+		page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+				   __GFP_REPEAT|__GFP_NOWARN,
+				   huge_page_order(h));
+	else
+		page = alloc_pages_exact_node(nid,
+			htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+			__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
 
 	if (page && arch_prepare_hugepage(page)) {
 		__free_pages(page, huge_page_order(h));
@@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 
 	spin_lock(&hugetlb_lock);
 	if (page) {
-		/*
-		 * This page is now managed by the hugetlb allocator and has
-		 * no users -- drop the buddy allocator's reference.
-		 */
-		put_page_testzero(page);
-		VM_BUG_ON(page_count(page));
-		nid = page_to_nid(page);
+		r_nid = page_to_nid(page);
 		set_compound_page_dtor(page, free_huge_page);
 		/*
 		 * We incremented the global counters already
 		 */
-		h->nr_huge_pages_node[nid]++;
-		h->surplus_huge_pages_node[nid]++;
+		h->nr_huge_pages_node[r_nid]++;
+		h->surplus_huge_pages_node[r_nid]++;
 		__count_vm_event(HTLB_BUDDY_PGALLOC);
 	} else {
 		h->nr_huge_pages--;
@@ -848,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 }
 
 /*
+ * This allocation function is useful in the context where vma is irrelevant.
+ * E.g. soft-offlining uses this function because it only cares physical
+ * address of error page.
+ */
+struct page *alloc_huge_page_node(struct hstate *h, int nid)
+{
+	struct page *page;
+
+	spin_lock(&hugetlb_lock);
+	page = dequeue_huge_page_node(h, nid);
+	spin_unlock(&hugetlb_lock);
+
+	if (!page)
+		page = alloc_buddy_huge_page(h, nid);
+
+	return page;
+}
+
+/*
  * Increase the hugetlb pool such that it can accomodate a reservation
  * of size 'delta'.
  */
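
The comment above spells out the intended use case: soft offlining only cares about the physical placement of the error page, so the replacement hugepage is allocated by node rather than through a vma. A hedged sketch of such a caller, assuming kernel context (the helper name is hypothetical, not part of this patch):

	#include <linux/mm.h>
	#include <linux/hugetlb.h>

	/* Hypothetical helper: allocate a migration target near the error hugepage. */
	static struct page *soft_offline_alloc_target(struct page *hpage)
	{
		/* tries the free pool on that node first, then falls back to the buddy allocator */
		return alloc_huge_page_node(page_hstate(hpage), page_to_nid(hpage));
	}
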
@@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
 	spin_unlock(&hugetlb_lock);
 	for (i = 0; i < needed; i++) {
-		page = alloc_buddy_huge_page(h, NULL, 0);
-		if (!page) {
+		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+		if (!page)
 			/*
 			 * We were not able to allocate enough pages to
 			 * satisfy the entire reservation so we free what
 			 * we've allocated so far.
 			 */
-			spin_lock(&hugetlb_lock);
-			needed = 0;
 			goto free;
-		}
 
 		list_add(&page->lru, &surplus_list);
 	}
@@ -908,31 +964,31 @@ retry:
 	needed += allocated;
 	h->resv_huge_pages += delta;
 	ret = 0;
-free:
+
+	spin_unlock(&hugetlb_lock);
 	/* Free the needed pages to the hugetlb pool */
 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 		if ((--needed) < 0)
 			break;
 		list_del(&page->lru);
+		/*
+		 * This page is now managed by the hugetlb allocator and has
+		 * no users -- drop the buddy allocator's reference.
+		 */
+		put_page_testzero(page);
+		VM_BUG_ON(page_count(page));
 		enqueue_huge_page(h, page);
 	}
 
 	/* Free unnecessary surplus pages to the buddy allocator */
+free:
 	if (!list_empty(&surplus_list)) {
-		spin_unlock(&hugetlb_lock);
 		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 			list_del(&page->lru);
-			/*
-			 * The page has a reference count of zero already, so
-			 * call free_huge_page directly instead of using
-			 * put_page. This must be done with hugetlb_lock
-			 * unlocked which is safe because free_huge_page takes
-			 * hugetlb_lock before deciding how to free the page.
-			 */
-			free_huge_page(page);
+			put_page(page);
 		}
-		spin_lock(&hugetlb_lock);
 	}
+	spin_lock(&hugetlb_lock);
 
 	return ret;
 }
@@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	spin_unlock(&hugetlb_lock);
 
 	if (!page) {
-		page = alloc_buddy_huge_page(h, vma, addr);
+		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
 			hugetlb_put_quota(inode->i_mapping, chg);
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 		}
 	}
 
-	set_page_refcounted(page);
 	set_page_private(page, (unsigned long) mapping);
 
 	vma_commit_reservation(h, vma, addr);
@@ -2153,6 +2208,19 @@ nomem:
 	return -ENOMEM;
 }
 
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+	swp_entry_t swp;
+
+	if (huge_pte_none(pte) || pte_present(pte))
+		return 0;
+	swp = pte_to_swp_entry(pte);
+	if (non_swap_entry(swp) && is_migration_entry(swp)) {
+		return 1;
+	} else
+		return 0;
+}
+
 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 {
 	swp_entry_t swp;
@@ -2383,7 +2451,7 @@ retry_avoidcopy:
 	if (unlikely(anon_vma_prepare(vma)))
 		return VM_FAULT_OOM;
 
-	copy_huge_page(new_page, old_page, address, vma);
+	copy_user_huge_page(new_page, old_page, address, vma);
 	__SetPageUptodate(new_page);
 
 	/*
@@ -2515,22 +2583,20 @@ retry:
 			hugepage_add_new_anon_rmap(page, vma, address);
 		}
 	} else {
+		/*
+		 * If memory error occurs between mmap() and fault, some process
+		 * don't have hwpoisoned swap entry for errored virtual address.
+		 * So we need to block hugepage fault by PG_hwpoison bit check.
+		 */
+		if (unlikely(PageHWPoison(page))) {
+			ret = VM_FAULT_HWPOISON |
+			      VM_FAULT_SET_HINDEX(h - hstates);
+			goto backout_unlocked;
+		}
 		page_dup_rmap(page);
 	}
 
 	/*
-	 * Since memory error handler replaces pte into hwpoison swap entry
-	 * at the time of error handling, a process which reserved but not have
-	 * the mapping to the error hugepage does not have hwpoison swap entry.
-	 * So we need to block accesses from such a process by checking
-	 * PG_hwpoison bit here.
-	 */
-	if (unlikely(PageHWPoison(page))) {
-		ret = VM_FAULT_HWPOISON;
-		goto backout_unlocked;
-	}
-
-	/*
 	 * If we are going to COW a private mapping later, we examine the
 	 * pending reservations for this page now. This will ensure that
 	 * any allocations necessary to record that reservation occur outside
@@ -2587,8 +2653,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	ptep = huge_pte_offset(mm, address);
 	if (ptep) {
 		entry = huge_ptep_get(ptep);
-		if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
-			return VM_FAULT_HWPOISON;
+		if (unlikely(is_hugetlb_entry_migration(entry))) {
+			migration_entry_wait(mm, (pmd_t *)ptep, address);
+			return 0;
+		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+			return VM_FAULT_HWPOISON_LARGE |
+			       VM_FAULT_SET_HINDEX(h - hstates);
 	}
 
 	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2878,18 +2948,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	hugetlb_acct_memory(h, -(chg - freed));
 }
 
+#ifdef CONFIG_MEMORY_FAILURE
+
+/* Should be called in hugetlb_lock */
+static int is_hugepage_on_freelist(struct page *hpage)
+{
+	struct page *page;
+	struct page *tmp;
+	struct hstate *h = page_hstate(hpage);
+	int nid = page_to_nid(hpage);
+
+	list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
+		if (page == hpage)
+			return 1;
+	return 0;
+}
+
 /*
  * This function is called from memory failure code.
  * Assume the caller holds page lock of the head page.
  */
-void __isolate_hwpoisoned_huge_page(struct page *hpage)
+int dequeue_hwpoisoned_huge_page(struct page *hpage)
 {
 	struct hstate *h = page_hstate(hpage);
 	int nid = page_to_nid(hpage);
+	int ret = -EBUSY;
 
 	spin_lock(&hugetlb_lock);
-	list_del(&hpage->lru);
-	h->free_huge_pages--;
-	h->free_huge_pages_node[nid]--;
+	if (is_hugepage_on_freelist(hpage)) {
+		list_del(&hpage->lru);
+		set_page_refcounted(hpage);
+		h->free_huge_pages--;
+		h->free_huge_pages_node[nid]--;
+		ret = 0;
+	}
 	spin_unlock(&hugetlb_lock);
+	return ret;
 }
+#endif
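
Unlike its predecessor, dequeue_hwpoisoned_huge_page() can now fail when the hugepage is not actually on a free list, so a caller in the memory-failure path has to check the return value while holding the head page's lock, as the comment above requires. A minimal hypothetical sketch (the surrounding function is illustrative, not part of this patch):

	#include <linux/mm.h>
	#include <linux/hugetlb.h>

	/* Hypothetical caller: try to pull a poisoned hugepage off the free list. */
	static int isolate_poisoned_hugepage(struct page *hpage)
	{
		/* caller is assumed to hold lock_page(hpage) on the head page */
		if (dequeue_hwpoisoned_huge_page(hpage))
			return -EBUSY;	/* not free; fall back to in-use handling */
		/* now off the free list, with a reference held by us */
		return 0;
	}
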