Diffstat (limited to 'mm/hugetlb.c')
 mm/hugetlb.c | 234
 1 file changed, 165 insertions(+), 69 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8874c8ad55aa..271e4432734c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock);
 static int num_fault_mutexes;
 static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
+/* Forward declaration */
+static int hugetlb_acct_memory(struct hstate *h, long delta);
+
 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 {
         bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -68,23 +71,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
         spin_unlock(&spool->lock);
 
         /* If no pages are used, and no other handles to the subpool
-         * remain, free the subpool the subpool remain */
-        if (free)
+         * remain, give up any reservations based on minimum size and
+         * free the subpool */
+        if (free) {
+                if (spool->min_hpages != -1)
+                        hugetlb_acct_memory(spool->hstate,
+                                                -spool->min_hpages);
                 kfree(spool);
+        }
 }
 
-struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
+                                                long min_hpages)
 {
         struct hugepage_subpool *spool;
 
-        spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+        spool = kzalloc(sizeof(*spool), GFP_KERNEL);
         if (!spool)
                 return NULL;
 
         spin_lock_init(&spool->lock);
         spool->count = 1;
-        spool->max_hpages = nr_blocks;
-        spool->used_hpages = 0;
+        spool->max_hpages = max_hpages;
+        spool->hstate = h;
+        spool->min_hpages = min_hpages;
+
+        if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
+                kfree(spool);
+                return NULL;
+        }
+        spool->rsv_hpages = min_hpages;
 
         return spool;
 }
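The two hunks above change the subpool lifetime: a subpool created with a minimum size charges that many pages against the global reserve up front (via hugetlb_acct_memory), and the charge is dropped again when the subpool is finally released. What follows is a minimal, userspace-only sketch of that lifetime under simplified assumptions; demo_subpool, demo_new_subpool, demo_release_subpool and demo_acct_memory are made-up stand-ins for the kernel structures and functions, and the count/used_hpages checks done by unlock_or_release_subpool() as well as all locking are omitted.

#include <stdio.h>
#include <stdlib.h>

static long demo_global_reserved;               /* stand-in for the global reserve count */
static const long demo_global_limit = 100;      /* pretend global capacity */

struct demo_subpool {
        long max_hpages;        /* -1 means "no maximum" */
        long min_hpages;        /* -1 means "no minimum" */
        long used_hpages;
        long rsv_hpages;        /* reserves still held on behalf of the subpool */
};

/* stand-in for hugetlb_acct_memory(): adjust the global reserve, fail on overcommit */
static int demo_acct_memory(long delta)
{
        if (demo_global_reserved + delta > demo_global_limit)
                return -1;
        demo_global_reserved += delta;
        return 0;
}

static struct demo_subpool *demo_new_subpool(long max_hpages, long min_hpages)
{
        struct demo_subpool *spool = calloc(1, sizeof(*spool));

        if (!spool)
                return NULL;
        spool->max_hpages = max_hpages;
        spool->min_hpages = min_hpages;
        /* a minimum size is reserved globally up front, as in the patch */
        if (min_hpages != -1 && demo_acct_memory(min_hpages)) {
                free(spool);
                return NULL;
        }
        spool->rsv_hpages = min_hpages;
        return spool;
}

static void demo_release_subpool(struct demo_subpool *spool)
{
        /* give back whatever was reserved for the minimum size */
        if (spool->min_hpages != -1)
                demo_acct_memory(-spool->min_hpages);
        free(spool);
}

int main(void)
{
        struct demo_subpool *spool = demo_new_subpool(-1, 10);

        printf("after create:  global reserved = %ld\n", demo_global_reserved); /* 10 */
        demo_release_subpool(spool);
        printf("after release: global reserved = %ld\n", demo_global_reserved); /* 0 */
        return 0;
}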
@@ -97,36 +113,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
         unlock_or_release_subpool(spool);
 }
 
-static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for allocating and reserving pages.
+ * Return -ENOMEM if there are not enough resources to satisfy the
+ * request.  Otherwise, return the number of pages by which the
+ * global pools must be adjusted (upward).  The returned value may
+ * only be different than the passed value (delta) in the case where
+ * a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
                                        long delta)
 {
-        int ret = 0;
+        long ret = delta;
 
         if (!spool)
-                return 0;
+                return ret;
 
         spin_lock(&spool->lock);
-        if ((spool->used_hpages + delta) <= spool->max_hpages) {
-                spool->used_hpages += delta;
-        } else {
-                ret = -ENOMEM;
+
+        if (spool->max_hpages != -1) {          /* maximum size accounting */
+                if ((spool->used_hpages + delta) <= spool->max_hpages)
+                        spool->used_hpages += delta;
+                else {
+                        ret = -ENOMEM;
+                        goto unlock_ret;
+                }
+        }
+
+        if (spool->min_hpages != -1) {          /* minimum size accounting */
+                if (delta > spool->rsv_hpages) {
+                        /*
+                         * Asking for more reserves than those already taken on
+                         * behalf of subpool.  Return difference.
+                         */
+                        ret = delta - spool->rsv_hpages;
+                        spool->rsv_hpages = 0;
+                } else {
+                        ret = 0;        /* reserves already accounted for */
+                        spool->rsv_hpages -= delta;
+                }
         }
-        spin_unlock(&spool->lock);
 
+unlock_ret:
+        spin_unlock(&spool->lock);
         return ret;
 }
 
-static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for freeing and unreserving pages.
+ * Return the number of global page reservations that must be dropped.
+ * The return value may only be different than the passed value (delta)
+ * in the case where a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
                                        long delta)
 {
+        long ret = delta;
+
         if (!spool)
-                return;
+                return delta;
 
         spin_lock(&spool->lock);
-        spool->used_hpages -= delta;
-        /* If hugetlbfs_put_super couldn't free spool due to
-         * an outstanding quota reference, free it now. */
+
+        if (spool->max_hpages != -1)            /* maximum size accounting */
+                spool->used_hpages -= delta;
+
+        if (spool->min_hpages != -1) {          /* minimum size accounting */
+                if (spool->rsv_hpages + delta <= spool->min_hpages)
+                        ret = 0;
+                else
+                        ret = spool->rsv_hpages + delta - spool->min_hpages;
+
+                spool->rsv_hpages += delta;
+                if (spool->rsv_hpages > spool->min_hpages)
+                        spool->rsv_hpages = spool->min_hpages;
+        }
+
+        /*
+         * If hugetlbfs_put_super couldn't free spool due to an outstanding
+         * quota reference, free it now.
+         */
         unlock_or_release_subpool(spool);
+
+        return ret;
 }
 
 static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
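The new return-value convention of the two helpers above is easiest to see in isolation: the value returned is the adjustment that must be applied to the global pools, and it differs from delta only when a minimum size is configured. Below is a self-contained C sketch of the same arithmetic; the demo_* names are illustrative, and locking, the hstate plumbing and the real -ENOMEM errno are left out.

#include <assert.h>
#include <stdio.h>

struct demo_subpool {
        long max_hpages;        /* -1: no maximum */
        long min_hpages;        /* -1: no minimum */
        long used_hpages;
        long rsv_hpages;        /* global reserves still held for the subpool */
};

/* returns how many pages the *global* pools must additionally provide */
static long demo_get_pages(struct demo_subpool *s, long delta)
{
        long ret = delta;

        if (s->max_hpages != -1) {              /* maximum size accounting */
                if (s->used_hpages + delta > s->max_hpages)
                        return -1;              /* -ENOMEM in the kernel */
                s->used_hpages += delta;
        }
        if (s->min_hpages != -1) {              /* minimum size accounting */
                if (delta > s->rsv_hpages) {
                        ret = delta - s->rsv_hpages;    /* only the shortfall */
                        s->rsv_hpages = 0;
                } else {
                        ret = 0;                /* fully covered by reserves */
                        s->rsv_hpages -= delta;
                }
        }
        return ret;
}

/* returns how many global reservations may be dropped */
static long demo_put_pages(struct demo_subpool *s, long delta)
{
        long ret = delta;

        if (s->max_hpages != -1)                /* maximum size accounting */
                s->used_hpages -= delta;
        if (s->min_hpages != -1) {              /* minimum size accounting */
                if (s->rsv_hpages + delta <= s->min_hpages)
                        ret = 0;                /* keep everything as reserves */
                else
                        ret = s->rsv_hpages + delta - s->min_hpages;
                s->rsv_hpages += delta;
                if (s->rsv_hpages > s->min_hpages)
                        s->rsv_hpages = s->min_hpages;
        }
        return ret;
}

int main(void)
{
        /* no maximum, minimum of 5 pages reserved up front */
        struct demo_subpool s = { .max_hpages = -1, .min_hpages = 5,
                                  .used_hpages = 0, .rsv_hpages = 5 };

        assert(demo_get_pages(&s, 3) == 0);     /* covered by the 5 reserves */
        assert(demo_get_pages(&s, 4) == 2);     /* 2 reserves left, need 2 more */
        assert(demo_put_pages(&s, 7) == 2);     /* refill to min, drop the rest */
        assert(s.rsv_hpages == 5);
        printf("subpool accounting sketch: all assertions passed\n");
        return 0;
}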
@@ -855,6 +924,31 @@ struct hstate *size_to_hstate(unsigned long size)
         return NULL;
 }
 
+/*
+ * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
+ * to hstate->hugepage_activelist.)
+ *
+ * This function can be called for tail pages, but never returns true for them.
+ */
+bool page_huge_active(struct page *page)
+{
+        VM_BUG_ON_PAGE(!PageHuge(page), page);
+        return PageHead(page) && PagePrivate(&page[1]);
+}
+
+/* never called for tail page */
+static void set_page_huge_active(struct page *page)
+{
+        VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+        SetPagePrivate(&page[1]);
+}
+
+static void clear_page_huge_active(struct page *page)
+{
+        VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+        ClearPagePrivate(&page[1]);
+}
+
 void free_huge_page(struct page *page)
 {
         /*
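The helpers above keep the "active" state of a huge page in the Private flag of the first tail page (page[1]), so the head page's own Private flag stays free for the reserve-restore logic. A toy model of that layout is sketched below; demo_page and its flag helpers are made up for illustration and stand in for struct page and the PagePrivate accessors.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_PG_HEAD    (1u << 0)
#define DEMO_PG_PRIVATE (1u << 1)

struct demo_page {
        unsigned int flags;
};

/* head and tails laid out contiguously, like a compound page */
static bool demo_page_huge_active(struct demo_page *page)
{
        /* query only makes sense on the head; the state lives on page[1] */
        return (page->flags & DEMO_PG_HEAD) &&
               (page[1].flags & DEMO_PG_PRIVATE);
}

static void demo_set_page_huge_active(struct demo_page *page)
{
        page[1].flags |= DEMO_PG_PRIVATE;
}

static void demo_clear_page_huge_active(struct demo_page *page)
{
        page[1].flags &= ~DEMO_PG_PRIVATE;
}

int main(void)
{
        struct demo_page hpage[8] = { { .flags = DEMO_PG_HEAD } };

        printf("active after alloc: %d\n", demo_page_huge_active(hpage)); /* 0 */
        demo_set_page_huge_active(hpage);
        printf("active after set:   %d\n", demo_page_huge_active(hpage)); /* 1 */
        demo_clear_page_huge_active(hpage);
        printf("active after clear: %d\n", demo_page_huge_active(hpage)); /* 0 */
        return 0;
}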
@@ -874,7 +968,16 @@ void free_huge_page(struct page *page)
         restore_reserve = PagePrivate(page);
         ClearPagePrivate(page);
 
+        /*
+         * A return code of zero implies that the subpool will be under its
+         * minimum size if the reservation is not restored after the page is
+         * freed.  Therefore, force the restore_reserve operation.
+         */
+        if (hugepage_subpool_put_pages(spool, 1) == 0)
+                restore_reserve = true;
+
         spin_lock(&hugetlb_lock);
+        clear_page_huge_active(page);
         hugetlb_cgroup_uncharge_page(hstate_index(h),
                                      pages_per_huge_page(h), page);
         if (restore_reserve)
@@ -891,7 +994,6 @@ void free_huge_page(struct page *page)
                 enqueue_huge_page(h, page);
         }
         spin_unlock(&hugetlb_lock);
-        hugepage_subpool_put_pages(spool, 1);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -1386,7 +1488,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
         if (chg < 0)
                 return ERR_PTR(-ENOMEM);
         if (chg || avoid_reserve)
-                if (hugepage_subpool_get_pages(spool, 1))
+                if (hugepage_subpool_get_pages(spool, 1) < 0)
                         return ERR_PTR(-ENOSPC);
 
         ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
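This one-line change follows from hugepage_subpool_get_pages() now returning a long where any non-negative value, including a positive one, means success (it is the number of pages to charge globally). A tiny sketch of why the old truthiness test would misfire, using a hypothetical demo_get_pages() stand-in:

#include <stdio.h>

static long demo_get_pages(void)
{
        return 1;       /* success: one page must be charged globally */
}

int main(void)
{
        long ret = demo_get_pages();

        if (ret)                /* old-style check: wrongly treats 1 as failure */
                printf("old check: would return -ENOSPC\n");
        if (ret < 0)            /* new check: only negative values are errors */
                printf("new check: error\n");
        else
                printf("new check: proceed, charge %ld page(s) globally\n", ret);
        return 0;
}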
@@ -2454,6 +2556,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
         struct resv_map *resv = vma_resv_map(vma);
         struct hugepage_subpool *spool = subpool_vma(vma);
         unsigned long reserve, start, end;
+        long gbl_reserve;
 
         if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                 return;
@@ -2466,8 +2569,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
         kref_put(&resv->refs, resv_map_release);
 
         if (reserve) {
-                hugetlb_acct_memory(h, -reserve);
-                hugepage_subpool_put_pages(spool, reserve);
+                /*
+                 * Decrement reserve counts.  The global reserve count may be
+                 * adjusted if the subpool has a minimum size.
+                 */
+                gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
+                hugetlb_acct_memory(h, -gbl_reserve);
         }
 }
 
@@ -2891,6 +2998,7 @@ retry_avoidcopy:
         copy_user_huge_page(new_page, old_page, address, vma,
                             pages_per_huge_page(h));
         __SetPageUptodate(new_page);
+        set_page_huge_active(new_page);
 
         mmun_start = address & huge_page_mask(h);
         mmun_end = mmun_start + huge_page_size(h);
@@ -3003,6 +3111,7 @@ retry:
         }
         clear_huge_page(page, address, pages_per_huge_page(h));
         __SetPageUptodate(page);
+        set_page_huge_active(page);
 
         if (vma->vm_flags & VM_MAYSHARE) {
                 int err;
@@ -3447,6 +3556,7 @@ int hugetlb_reserve_pages(struct inode *inode,
         struct hstate *h = hstate_inode(inode);
         struct hugepage_subpool *spool = subpool_inode(inode);
         struct resv_map *resv_map;
+        long gbl_reserve;
 
         /*
          * Only apply hugepage reservation if asked.  At fault time, an
@@ -3483,8 +3593,13 @@ int hugetlb_reserve_pages(struct inode *inode,
                 goto out_err;
         }
 
-        /* There must be enough pages in the subpool for the mapping */
-        if (hugepage_subpool_get_pages(spool, chg)) {
+        /*
+         * There must be enough pages in the subpool for the mapping. If
+         * the subpool has a minimum size, there may be some global
+         * reservations already in place (gbl_reserve).
+         */
+        gbl_reserve = hugepage_subpool_get_pages(spool, chg);
+        if (gbl_reserve < 0) {
                 ret = -ENOSPC;
                 goto out_err;
         }
@@ -3493,9 +3608,10 @@ int hugetlb_reserve_pages(struct inode *inode,
          * Check enough hugepages are available for the reservation.
          * Hand the pages back to the subpool if there are not
          */
-        ret = hugetlb_acct_memory(h, chg);
+        ret = hugetlb_acct_memory(h, gbl_reserve);
         if (ret < 0) {
-                hugepage_subpool_put_pages(spool, chg);
+                /* put back original number of pages, chg */
+                (void)hugepage_subpool_put_pages(spool, chg);
                 goto out_err;
         }
 
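Taken together, the two hunks above give hugetlb_reserve_pages() the following shape: charge the subpool for chg pages, charge the global pools only for the part not already covered by subpool reserves (gbl_reserve), and on failure hand the full chg back to the subpool. A compact sketch of that ordering, with hypothetical, stateless demo_* stand-ins for the real accounting functions:

#include <stdio.h>

/* pretend the subpool already holds 2 pages of reserves */
static long demo_subpool_get_pages(long delta)
{
        long rsv = 2;

        return delta > rsv ? delta - rsv : 0;
}

static void demo_subpool_put_pages(long delta)
{
        printf("handing %ld page(s) back to the subpool\n", delta);
}

/* pretend the global pools can only cover 1 more page */
static int demo_acct_memory(long delta)
{
        return delta > 1 ? -1 : 0;
}

static int demo_reserve(long chg)
{
        long gbl_reserve = demo_subpool_get_pages(chg);

        if (gbl_reserve < 0)
                return -1;                      /* -ENOSPC in the kernel */

        if (demo_acct_memory(gbl_reserve) < 0) {
                /* unwind with the original chg, not gbl_reserve */
                demo_subpool_put_pages(chg);
                return -1;
        }
        printf("reserved: chg=%ld, charged globally=%ld\n", chg, gbl_reserve);
        return 0;
}

int main(void)
{
        demo_reserve(3);        /* succeeds: only 1 page charged globally */
        demo_reserve(5);        /* global charge of 3 fails, all 5 go back */
        return 0;
}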
@@ -3525,6 +3641,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
         struct resv_map *resv_map = inode_resv_map(inode);
         long chg = 0;
         struct hugepage_subpool *spool = subpool_inode(inode);
+        long gbl_reserve;
 
         if (resv_map)
                 chg = region_truncate(resv_map, offset);
@@ -3532,8 +3649,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
         inode->i_blocks -= (blocks_per_huge_page(h) * freed);
         spin_unlock(&inode->i_lock);
 
-        hugepage_subpool_put_pages(spool, (chg - freed));
-        hugetlb_acct_memory(h, -(chg - freed));
+        /*
+         * If the subpool has a minimum size, the number of global
+         * reservations to be released may be adjusted.
+         */
+        gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
+        hugetlb_acct_memory(h, -gbl_reserve);
 }
 
 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -3775,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 
 #ifdef CONFIG_MEMORY_FAILURE
 
-/* Should be called in hugetlb_lock */
-static int is_hugepage_on_freelist(struct page *hpage)
-{
-        struct page *page;
-        struct page *tmp;
-        struct hstate *h = page_hstate(hpage);
-        int nid = page_to_nid(hpage);
-
-        list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
-                if (page == hpage)
-                        return 1;
-        return 0;
-}
-
 /*
  * This function is called from memory failure code.
  * Assume the caller holds page lock of the head page.
@@ -3800,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
         int ret = -EBUSY;
 
         spin_lock(&hugetlb_lock);
-        if (is_hugepage_on_freelist(hpage)) {
+        /*
+         * Just checking !page_huge_active is not enough, because that could be
+         * an isolated/hwpoisoned hugepage (which have >0 refcount).
+         */
+        if (!page_huge_active(hpage) && !page_count(hpage)) {
                 /*
                  * Hwpoisoned hugepage isn't linked to activelist or freelist,
                  * but dangling hpage->lru can trigger list-debug warnings
@@ -3820,42 +3931,27 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
 
 bool isolate_huge_page(struct page *page, struct list_head *list)
 {
+        bool ret = true;
+
         VM_BUG_ON_PAGE(!PageHead(page), page);
-        if (!get_page_unless_zero(page))
-                return false;
         spin_lock(&hugetlb_lock);
+        if (!page_huge_active(page) || !get_page_unless_zero(page)) {
+                ret = false;
+                goto unlock;
+        }
+        clear_page_huge_active(page);
         list_move_tail(&page->lru, list);
+unlock:
         spin_unlock(&hugetlb_lock);
-        return true;
+        return ret;
 }
 
 void putback_active_hugepage(struct page *page)
 {
         VM_BUG_ON_PAGE(!PageHead(page), page);
         spin_lock(&hugetlb_lock);
+        set_page_huge_active(page);
         list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
         spin_unlock(&hugetlb_lock);
         put_page(page);
 }
-
-bool is_hugepage_active(struct page *page)
-{
-        VM_BUG_ON_PAGE(!PageHuge(page), page);
-        /*
-         * This function can be called for a tail page because the caller,
-         * scan_movable_pages, scans through a given pfn-range which typically
-         * covers one memory block. In systems using gigantic hugepage (1GB
-         * for x86_64,) a hugepage is larger than a memory block, and we don't
-         * support migrating such large hugepages for now, so return false
-         * when called for tail pages.
-         */
-        if (PageTail(page))
-                return false;
-        /*
-         * Refcount of a hwpoisoned hugepages is 1, but they are not active,
-         * so we should return false for them.
-         */
-        if (unlikely(PageHWPoison(page)))
-                return false;
-        return page_count(page) > 0;
-}
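Finally, the isolation paths above now gate on the active flag: only an active huge page may be isolated for migration, the flag is cleared while the page is off the activelist, and putback_active_hugepage() restores it. The sequence is sketched below with a made-up demo_hpage type and no real locking, lists or refcounting.

#include <stdbool.h>
#include <stdio.h>

struct demo_hpage {
        bool active;    /* stand-in for page_huge_active() */
        bool isolated;
};

static bool demo_isolate(struct demo_hpage *p)
{
        if (!p->active)         /* e.g. still being set up, or already isolated */
                return false;
        p->active = false;      /* clear_page_huge_active() */
        p->isolated = true;     /* list_move_tail() onto the migration list */
        return true;
}

static void demo_putback(struct demo_hpage *p)
{
        p->active = true;       /* set_page_huge_active() */
        p->isolated = false;    /* back onto hugepage_activelist */
}

int main(void)
{
        struct demo_hpage p = { .active = true };

        printf("first isolate:  %d\n", demo_isolate(&p));      /* 1 */
        printf("second isolate: %d\n", demo_isolate(&p));      /* 0: already isolated */
        demo_putback(&p);
        printf("after putback:  %d\n", demo_isolate(&p));      /* 1 */
        return 0;
}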