Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	398
1 file changed, 326 insertions(+), 72 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eab8c428cc93..ae2959bb59cb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -23,12 +23,16 @@
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
+static unsigned long surplus_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
+static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
+int hugetlb_dynamic_pool;
+static int hugetlb_next_nid;
 
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
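
The new hugetlb_dynamic_pool flag gates whether surplus huge pages may be taken from the buddy allocator at fault time. Its sysctl wiring lives outside this file (in kernel/sysctl.c, which is not part of this diff); a plausible sketch of such an entry against the 2.6.23-era ctl_table layout, with the procname and placement under vm assumed rather than confirmed by this diff:

	/* Hypothetical kernel/sysctl.c entry exposing the flag as
	 * /proc/sys/vm/hugetlb_dynamic_pool; name and placement are
	 * assumptions, since the wiring is not shown in this hunk. */
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "hugetlb_dynamic_pool",
		.data		= &hugetlb_dynamic_pool,
		.maxlen		= sizeof(hugetlb_dynamic_pool),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},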
@@ -85,6 +89,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 			list_del(&page->lru);
 			free_huge_pages--;
 			free_huge_pages_node[nid]--;
+			if (vma && vma->vm_flags & VM_MAYSHARE)
+				resv_huge_pages--;
 			break;
 		}
 	}
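
The two added lines fold shared-mapping reservation accounting into the dequeue itself: handing a free huge page to a VM_MAYSHARE vma also consumes one reservation. A minimal userspace program that exercises this path, assuming hugetlbfs is mounted at /mnt/huge, the pool holds at least one page, and the huge page size is 2 MB:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#define LENGTH (2UL * 1024 * 1024)	/* assumes 2 MB huge pages */

	int main(void)
	{
		/* assumes hugetlbfs is mounted at /mnt/huge */
		int fd = open("/mnt/huge/demo", O_CREAT | O_RDWR, 0600);
		char *p;

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* mmap of a hugetlbfs file reserves pages: HugePages_Rsvd rises */
		p = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		/* first touch faults a page in via the dequeue path above:
		 * HugePages_Free and HugePages_Rsvd both drop by one */
		memset(p, 0, LENGTH);

		munmap(p, LENGTH);
		close(fd);
		unlink("/mnt/huge/demo");
		return 0;
	}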
@@ -92,58 +98,269 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
+static void update_and_free_page(struct page *page)
+{
+	int i;
+	nr_huge_pages--;
+	nr_huge_pages_node[page_to_nid(page)]--;
+	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
+		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
+				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
+				1 << PG_private | 1 << PG_writeback);
+	}
+	set_compound_page_dtor(page, NULL);
+	set_page_refcounted(page);
+	__free_pages(page, HUGETLB_PAGE_ORDER);
+}
+
 static void free_huge_page(struct page *page)
 {
-	BUG_ON(page_count(page));
+	int nid = page_to_nid(page);
 
+	BUG_ON(page_count(page));
 	INIT_LIST_HEAD(&page->lru);
 
 	spin_lock(&hugetlb_lock);
-	enqueue_huge_page(page);
+	if (surplus_huge_pages_node[nid]) {
+		update_and_free_page(page);
+		surplus_huge_pages--;
+		surplus_huge_pages_node[nid]--;
+	} else {
+		enqueue_huge_page(page);
+	}
 	spin_unlock(&hugetlb_lock);
 }
 
-static int alloc_fresh_huge_page(void)
+/*
+ * Increment or decrement surplus_huge_pages.  Keep node-specific counters
+ * balanced by operating on them in a round-robin fashion.
+ * Returns 1 if an adjustment was made.
+ */
+static int adjust_pool_surplus(int delta)
 {
 	static int prev_nid;
-	struct page *page;
-	int nid;
+	int nid = prev_nid;
+	int ret = 0;
+
+	VM_BUG_ON(delta != -1 && delta != 1);
+	do {
+		nid = next_node(nid, node_online_map);
+		if (nid == MAX_NUMNODES)
+			nid = first_node(node_online_map);
+
+		/* To shrink on this node, there must be a surplus page */
+		if (delta < 0 && !surplus_huge_pages_node[nid])
+			continue;
+		/* Surplus cannot exceed the total number of pages */
+		if (delta > 0 && surplus_huge_pages_node[nid] >=
+						nr_huge_pages_node[nid])
+			continue;
+
+		surplus_huge_pages += delta;
+		surplus_huge_pages_node[nid] += delta;
+		ret = 1;
+		break;
+	} while (nid != prev_nid);
 
-	/*
-	 * Copy static prev_nid to local nid, work on that, then copy it
-	 * back to prev_nid afterwards: otherwise there's a window in which
-	 * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node.
-	 * But we don't need to use a spin_lock here: it really doesn't
-	 * matter if occasionally a racer chooses the same nid as we do.
-	 */
-	nid = next_node(prev_nid, node_online_map);
-	if (nid == MAX_NUMNODES)
-		nid = first_node(node_online_map);
 	prev_nid = nid;
+	return ret;
+}
+
+static struct page *alloc_fresh_huge_page_node(int nid)
+{
+	struct page *page;
 
-	page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
+	page = alloc_pages_node(nid,
+		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
+		HUGETLB_PAGE_ORDER);
+	if (page) {
+		set_compound_page_dtor(page, free_huge_page);
+		spin_lock(&hugetlb_lock);
+		nr_huge_pages++;
+		nr_huge_pages_node[nid]++;
+		spin_unlock(&hugetlb_lock);
+		put_page(page); /* free it into the hugepage allocator */
+	}
+
+	return page;
+}
+
+static int alloc_fresh_huge_page(void)
+{
+	struct page *page;
+	int start_nid;
+	int next_nid;
+	int ret = 0;
+
+	start_nid = hugetlb_next_nid;
+
+	do {
+		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
+		if (page)
+			ret = 1;
+		/*
+		 * Use a helper variable to find the next node and then
+		 * copy it back to hugetlb_next_nid afterwards:
+		 * otherwise there's a window in which a racer might
+		 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
+		 * But we don't need to use a spin_lock here: it really
+		 * doesn't matter if occasionally a racer chooses the
+		 * same nid as we do.  Move nid forward in the mask even
+		 * if we just successfully allocated a hugepage so that
+		 * the next caller gets hugepages on the next node.
+		 */
+		next_nid = next_node(hugetlb_next_nid, node_online_map);
+		if (next_nid == MAX_NUMNODES)
+			next_nid = first_node(node_online_map);
+		hugetlb_next_nid = next_nid;
+	} while (!page && hugetlb_next_nid != start_nid);
+
+	return ret;
+}
+
+static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
+						unsigned long address)
+{
+	struct page *page;
+
+	/* Check if the dynamic pool is enabled */
+	if (!hugetlb_dynamic_pool)
+		return NULL;
+
+	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
 				HUGETLB_PAGE_ORDER);
 	if (page) {
 		set_compound_page_dtor(page, free_huge_page);
 		spin_lock(&hugetlb_lock);
 		nr_huge_pages++;
 		nr_huge_pages_node[page_to_nid(page)]++;
+		surplus_huge_pages++;
+		surplus_huge_pages_node[page_to_nid(page)]++;
 		spin_unlock(&hugetlb_lock);
-		put_page(page); /* free it into the hugepage allocator */
-		return 1;
 	}
-	return 0;
+
+	return page;
+}
+
+/*
+ * Increase the hugetlb pool such that it can accommodate a reservation
+ * of size 'delta'.
+ */
+static int gather_surplus_pages(int delta)
+{
+	struct list_head surplus_list;
+	struct page *page, *tmp;
+	int ret, i;
+	int needed, allocated;
+
+	needed = (resv_huge_pages + delta) - free_huge_pages;
+	if (needed <= 0)
+		return 0;
+
+	allocated = 0;
+	INIT_LIST_HEAD(&surplus_list);
+
+	ret = -ENOMEM;
+retry:
+	spin_unlock(&hugetlb_lock);
+	for (i = 0; i < needed; i++) {
+		page = alloc_buddy_huge_page(NULL, 0);
+		if (!page) {
+			/*
+			 * We were not able to allocate enough pages to
+			 * satisfy the entire reservation so we free what
+			 * we've allocated so far.
+			 */
+			spin_lock(&hugetlb_lock);
+			needed = 0;
+			goto free;
+		}
+
+		list_add(&page->lru, &surplus_list);
+	}
+	allocated += needed;
+
+	/*
+	 * After retaking hugetlb_lock, we need to recalculate 'needed'
+	 * because either resv_huge_pages or free_huge_pages may have changed.
+	 */
+	spin_lock(&hugetlb_lock);
+	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
+	if (needed > 0)
+		goto retry;
+
+	/*
+	 * The surplus_list now contains _at_least_ the number of extra pages
+	 * needed to accommodate the reservation.  Add the appropriate number
+	 * of pages to the hugetlb pool and free the extras back to the buddy
+	 * allocator.
+	 */
+	needed += allocated;
+	ret = 0;
+free:
+	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+		list_del(&page->lru);
+		if ((--needed) >= 0)
+			enqueue_huge_page(page);
+		else {
+			/*
+			 * Decrement the refcount and free the page using its
+			 * destructor.  This must be done with hugetlb_lock
+			 * unlocked which is safe because free_huge_page takes
+			 * hugetlb_lock before deciding how to free the page.
+			 */
+			spin_unlock(&hugetlb_lock);
+			put_page(page);
+			spin_lock(&hugetlb_lock);
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * When releasing a hugetlb pool reservation, any surplus pages that were
+ * allocated to satisfy the reservation must be explicitly freed if they were
+ * never used.
+ */
+void return_unused_surplus_pages(unsigned long unused_resv_pages)
+{
+	static int nid = -1;
+	struct page *page;
+	unsigned long nr_pages;
+
+	nr_pages = min(unused_resv_pages, surplus_huge_pages);
+
+	while (nr_pages) {
+		nid = next_node(nid, node_online_map);
+		if (nid == MAX_NUMNODES)
+			nid = first_node(node_online_map);
+
+		if (!surplus_huge_pages_node[nid])
+			continue;
+
+		if (!list_empty(&hugepage_freelists[nid])) {
+			page = list_entry(hugepage_freelists[nid].next,
+					  struct page, lru);
+			list_del(&page->lru);
+			update_and_free_page(page);
+			free_huge_pages--;
+			free_huge_pages_node[nid]--;
+			surplus_huge_pages--;
+			surplus_huge_pages_node[nid]--;
+			nr_pages--;
+		}
+	}
 }
 
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr)
 {
-	struct page *page;
+	struct page *page = NULL;
+	int use_reserved_page = vma->vm_flags & VM_MAYSHARE;
 
 	spin_lock(&hugetlb_lock);
-	if (vma->vm_flags & VM_MAYSHARE)
-		resv_huge_pages--;
-	else if (free_huge_pages <= resv_huge_pages)
+	if (!use_reserved_page && (free_huge_pages <= resv_huge_pages))
 		goto fail;
 
 	page = dequeue_huge_page(vma, addr);
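
The retry accounting in gather_surplus_pages() above is easiest to follow with concrete numbers. Below is a small userspace model of the arithmetic only (plain integers stand in for the locked kernel counters, and every allocation is assumed to succeed), showing that the pool ends up with at least resv + delta free pages:

	#include <stdio.h>

	/* Userspace model of the gather_surplus_pages() arithmetic. */
	int main(void)
	{
		int resv_huge_pages = 3;	/* outstanding reservations */
		int free_huge_pages = 2;	/* free pages in the pool */
		int delta = 4;			/* size of the new reservation */
		int allocated = 0;
		int needed = (resv_huge_pages + delta) - free_huge_pages;

		while (needed > 0) {
			/* the alloc_buddy_huge_page() loop: grab 'needed' pages */
			allocated += needed;
			/* recalculated under the lock in the kernel; with no
			 * racing changes this second pass yields 0 and we stop */
			needed = (resv_huge_pages + delta) -
				 (free_huge_pages + allocated);
		}

		/* pages to keep; any excess goes back to the buddy allocator */
		needed += allocated;
		free_huge_pages += needed;

		printf("allocated %d, kept %d, free now %d (>= resv + delta = %d)\n",
		       allocated, needed, free_huge_pages,
		       resv_huge_pages + delta);
		return 0;
	}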
@@ -155,10 +372,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	return page;
 
 fail:
-	if (vma->vm_flags & VM_MAYSHARE)
-		resv_huge_pages++;
 	spin_unlock(&hugetlb_lock);
-	return NULL;
+
+	/*
+	 * Private mappings do not use reserved huge pages so the allocation
+	 * may have failed due to an undersized hugetlb pool. Try to grab a
+	 * surplus huge page from the buddy allocator.
+	 */
+	if (!use_reserved_page)
+		page = alloc_buddy_huge_page(vma, addr);
+
+	return page;
 }
 
 static int __init hugetlb_init(void)
@@ -171,6 +395,8 @@ static int __init hugetlb_init(void)
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&hugepage_freelists[i]);
 
+	hugetlb_next_nid = first_node(node_online_map);
+
 	for (i = 0; i < max_huge_pages; ++i) {
 		if (!alloc_fresh_huge_page())
 			break;
@@ -201,21 +427,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 }
 
 #ifdef CONFIG_SYSCTL
-static void update_and_free_page(struct page *page)
-{
-	int i;
-	nr_huge_pages--;
-	nr_huge_pages_node[page_to_nid(page)]--;
-	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
-		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1 << PG_writeback);
-	}
-	set_compound_page_dtor(page, NULL);
-	set_page_refcounted(page);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
 #ifdef CONFIG_HIGHMEM
 static void try_to_free_low(unsigned long count)
 {
@@ -224,14 +435,14 @@ static void try_to_free_low(unsigned long count)
 	for (i = 0; i < MAX_NUMNODES; ++i) {
 		struct page *page, *next;
 		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
+			if (count >= nr_huge_pages)
+				return;
 			if (PageHighMem(page))
 				continue;
 			list_del(&page->lru);
 			update_and_free_page(page);
 			free_huge_pages--;
 			free_huge_pages_node[page_to_nid(page)]--;
-			if (count >= nr_huge_pages)
-				return;
 		}
 	}
 }
@@ -241,26 +452,61 @@ static inline void try_to_free_low(unsigned long count)
 }
 #endif
 
+#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
 static unsigned long set_max_huge_pages(unsigned long count)
 {
-	while (count > nr_huge_pages) {
-		if (!alloc_fresh_huge_page())
-			return nr_huge_pages;
-	}
-	if (count >= nr_huge_pages)
-		return nr_huge_pages;
+	unsigned long min_count, ret;
 
+	/*
+	 * Increase the pool size
+	 * First take pages out of surplus state.  Then make up the
+	 * remaining difference by allocating fresh huge pages.
+	 */
 	spin_lock(&hugetlb_lock);
-	count = max(count, resv_huge_pages);
-	try_to_free_low(count);
-	while (count < nr_huge_pages) {
+	while (surplus_huge_pages && count > persistent_huge_pages) {
+		if (!adjust_pool_surplus(-1))
+			break;
+	}
+
+	while (count > persistent_huge_pages) {
+		int ret;
+		/*
+		 * If this allocation races such that we no longer need the
+		 * page, free_huge_page will handle it by freeing the page
+		 * and reducing the surplus.
+		 */
+		spin_unlock(&hugetlb_lock);
+		ret = alloc_fresh_huge_page();
+		spin_lock(&hugetlb_lock);
+		if (!ret)
+			goto out;
+
+	}
+
+	/*
+	 * Decrease the pool size
+	 * First return free pages to the buddy allocator (being careful
+	 * to keep enough around to satisfy reservations).  Then place
+	 * pages into surplus state as needed so the pool will shrink
+	 * to the desired size as pages become free.
+	 */
+	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
+	min_count = max(count, min_count);
+	try_to_free_low(min_count);
+	while (min_count < persistent_huge_pages) {
 		struct page *page = dequeue_huge_page(NULL, 0);
 		if (!page)
 			break;
 		update_and_free_page(page);
 	}
+	while (count < persistent_huge_pages) {
+		if (!adjust_pool_surplus(1))
+			break;
+	}
+out:
+	ret = persistent_huge_pages;
 	spin_unlock(&hugetlb_lock);
-	return nr_huge_pages;
+	return ret;
 }
 
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
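
The shrink path's min_count guard deserves a worked example: with 10 pages in the pool, 4 free, and 3 reserved, 6 pages are in use, so the pool may not drop below in-use plus reserved = 9 no matter how small the requested count is. A standalone sketch of that arithmetic:

	#include <stdio.h>

	/* Worked example of the set_max_huge_pages() shrink guard: the
	 * pool must keep every in-use page plus every reserved page. */
	int main(void)
	{
		unsigned long nr_huge_pages = 10;	/* total pool size */
		unsigned long free_huge_pages = 4;	/* so 6 pages are in use */
		unsigned long resv_huge_pages = 3;	/* promised to mappings */
		unsigned long count = 0;		/* admin asks for an empty pool */
		unsigned long min_count;

		min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; /* 9 */
		if (count > min_count)
			min_count = count;

		/* only free - resv = 1 page can be released immediately; the
		 * rest become surplus and shrink away as users free them */
		printf("pool can shrink to %lu pages right away\n", min_count);
		return 0;
	}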
@@ -292,10 +538,12 @@ int hugetlb_report_meminfo(char *buf)
292 "HugePages_Total: %5lu\n" 538 "HugePages_Total: %5lu\n"
293 "HugePages_Free: %5lu\n" 539 "HugePages_Free: %5lu\n"
294 "HugePages_Rsvd: %5lu\n" 540 "HugePages_Rsvd: %5lu\n"
541 "HugePages_Surp: %5lu\n"
295 "Hugepagesize: %5lu kB\n", 542 "Hugepagesize: %5lu kB\n",
296 nr_huge_pages, 543 nr_huge_pages,
297 free_huge_pages, 544 free_huge_pages,
298 resv_huge_pages, 545 resv_huge_pages,
546 surplus_huge_pages,
299 HPAGE_SIZE/1024); 547 HPAGE_SIZE/1024);
300} 548}
301 549
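
With the extra line in /proc/meminfo, the surplus count becomes visible to userspace. A small reader for the new field (field name taken from the format string above):

	#include <stdio.h>
	#include <string.h>

	/* Print the HugePages_Surp line emitted by hugetlb_report_meminfo(). */
	int main(void)
	{
		char line[128];
		FILE *f = fopen("/proc/meminfo", "r");

		if (!f) {
			perror("fopen");
			return 1;
		}
		while (fgets(line, sizeof(line), f)) {
			if (!strncmp(line, "HugePages_Surp:", 15)) {
				fputs(line, stdout);
				break;
			}
		}
		fclose(f);
		return 0;
	}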
@@ -355,7 +603,6 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
 	entry = pte_mkwrite(pte_mkdirty(*ptep));
 	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
 		update_mmu_cache(vma, address, entry);
-		lazy_mmu_prot_update(entry);
 	}
 }
 
@@ -708,7 +955,6 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 			pte = huge_ptep_get_and_clear(mm, address, ptep);
 			pte = pte_mkhuge(pte_modify(pte, newprot));
 			set_huge_pte_at(mm, address, ptep, pte);
-			lazy_mmu_prot_update(pte);
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
@@ -843,21 +1089,6 @@ static int hugetlb_acct_memory(long delta)
 	int ret = -ENOMEM;
 
 	spin_lock(&hugetlb_lock);
-	if ((delta + resv_huge_pages) <= free_huge_pages) {
-		resv_huge_pages += delta;
-		ret = 0;
-	}
-	spin_unlock(&hugetlb_lock);
-	return ret;
-}
-
-int hugetlb_reserve_pages(struct inode *inode, long from, long to)
-{
-	long ret, chg;
-
-	chg = region_chg(&inode->i_mapping->private_list, from, to);
-	if (chg < 0)
-		return chg;
 	/*
 	 * When cpuset is configured, it breaks the strict hugetlb page
 	 * reservation as the accounting is done on a global variable. Such
@@ -875,8 +1106,31 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
 	 * a best attempt and hopefully to minimize the impact of changing
 	 * semantics that cpuset has.
 	 */
-	if (chg > cpuset_mems_nr(free_huge_pages_node))
-		return -ENOMEM;
+	if (delta > 0) {
+		if (gather_surplus_pages(delta) < 0)
+			goto out;
+
+		if (delta > cpuset_mems_nr(free_huge_pages_node))
+			goto out;
+	}
+
+	ret = 0;
+	resv_huge_pages += delta;
+	if (delta < 0)
+		return_unused_surplus_pages((unsigned long) -delta);
+
+out:
+	spin_unlock(&hugetlb_lock);
+	return ret;
+}
+
+int hugetlb_reserve_pages(struct inode *inode, long from, long to)
+{
+	long ret, chg;
+
+	chg = region_chg(&inode->i_mapping->private_list, from, to);
+	if (chg < 0)
+		return chg;
 
 	ret = hugetlb_acct_memory(chg);
 	if (ret < 0