author     Adam Litke <agl@us.ibm.com>                            2007-10-16 04:26:19 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>   2007-10-16 12:43:02 -0400
commit     e4e574b767ba63101cfda2b42d72f38546319297 (patch)
tree       084b94d01c71ccd898f8df0ec441e6726e657e75 /mm/hugetlb.c
parent     7893d1d505d59db9d4f35165c8b6d3c6dff40a32 (diff)
hugetlb: Try to grow hugetlb pool for MAP_SHARED mappings
Shared mappings require special handling because the huge pages needed to
fully populate the VMA must be reserved at mmap time. If not enough pages are
available when making the reservation, allocate all of the shortfall at once
from the buddy allocator and add the pages directly to the hugetlb pool. If
they cannot be allocated, then fail the mapping. The page surplus is
accounted for in the same way as for private mappings; faulted surplus pages
will be freed at unmap time. Reserved surplus pages that have not been used
must be freed separately when their reservation is released.
Signed-off-by: Adam Litke <agl@us.ibm.com>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Dave McCracken <dave.mccracken@oracle.com>
Cc: William Irwin <bill.irwin@oracle.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Ken Chen <kenchen@google.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
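
The mechanism described in the changelog can be modelled with a handful of counters. The snippet below is a minimal userspace sketch of that accounting only, not part of the patch: the demo_* names, the fixed pool sizes, and the single buddy_available counter are invented for illustration and stand in for gather_surplus_pages(), return_unused_surplus_pages(), and hugetlb_acct_memory() in the diff that follows.

/*
 * Minimal userspace model of the reservation accounting described in the
 * changelog -- not kernel code.  Locking and the re-check/retry dance of
 * gather_surplus_pages() are deliberately omitted.
 */
#include <stdio.h>

static long free_pages = 4;        /* pages currently in the hugetlb pool  */
static long resv_pages = 0;        /* pages promised to shared mappings    */
static long surplus_pages = 0;     /* pool pages borrowed from the buddy   */
static long buddy_available = 3;   /* pretend buddy-allocator capacity     */

/* Grow the pool so it can cover a reservation of 'delta' pages. */
static int demo_gather_surplus(long delta)
{
        long needed = (resv_pages + delta) - free_pages;

        if (needed <= 0)
                return 0;                 /* pool is already big enough */
        if (needed > buddy_available)
                return -1;                /* shortfall cannot be covered */

        buddy_available -= needed;        /* allocate the whole shortfall */
        free_pages += needed;
        surplus_pages += needed;
        return 0;
}

/* Reserve (delta > 0) or release (delta < 0) pages for a shared mapping. */
static int demo_acct_memory(long delta)
{
        if (delta > 0 && demo_gather_surplus(delta) < 0)
                return -1;                /* fail the mapping at mmap time */

        resv_pages += delta;
        if (delta < 0) {
                /* Unused surplus pages go straight back to the buddy side. */
                long unused = -delta < surplus_pages ? -delta : surplus_pages;

                surplus_pages -= unused;
                free_pages -= unused;
                buddy_available += unused;
        }
        return 0;
}

int main(void)
{
        /* A 6-page shared mapping: 2 surplus pages are pulled into the pool. */
        printf("reserve 6 pages: %s\n", demo_acct_memory(6) ? "fail" : "ok");
        printf("free=%ld resv=%ld surplus=%ld\n",
               free_pages, resv_pages, surplus_pages);

        /* Dropping the reservation returns the unused surplus pages. */
        demo_acct_memory(-6);
        printf("free=%ld resv=%ld surplus=%ld\n",
               free_pages, resv_pages, surplus_pages);
        return 0;
}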
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--   mm/hugetlb.c | 155
1 file changed, 132 insertions(+), 23 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8768e525032..31bbca6b2c9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -87,6 +87,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
                         list_del(&page->lru);
                         free_huge_pages--;
                         free_huge_pages_node[nid]--;
+                        if (vma && vma->vm_flags & VM_MAYSHARE)
+                                resv_huge_pages--;
                         break;
                 }
         }
@@ -214,15 +216,116 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
         return page;
 }
 
+/*
+ * Increase the hugetlb pool such that it can accomodate a reservation
+ * of size 'delta'.
+ */
+static int gather_surplus_pages(int delta)
+{
+        struct list_head surplus_list;
+        struct page *page, *tmp;
+        int ret, i;
+        int needed, allocated;
+
+        needed = (resv_huge_pages + delta) - free_huge_pages;
+        if (needed <= 0)
+                return 0;
+
+        allocated = 0;
+        INIT_LIST_HEAD(&surplus_list);
+
+        ret = -ENOMEM;
+retry:
+        spin_unlock(&hugetlb_lock);
+        for (i = 0; i < needed; i++) {
+                page = alloc_buddy_huge_page(NULL, 0);
+                if (!page) {
+                        /*
+                         * We were not able to allocate enough pages to
+                         * satisfy the entire reservation so we free what
+                         * we've allocated so far.
+                         */
+                        spin_lock(&hugetlb_lock);
+                        needed = 0;
+                        goto free;
+                }
+
+                list_add(&page->lru, &surplus_list);
+        }
+        allocated += needed;
+
+        /*
+         * After retaking hugetlb_lock, we need to recalculate 'needed'
+         * because either resv_huge_pages or free_huge_pages may have changed.
+         */
+        spin_lock(&hugetlb_lock);
+        needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
+        if (needed > 0)
+                goto retry;
+
+        /*
+         * The surplus_list now contains _at_least_ the number of extra pages
+         * needed to accomodate the reservation.  Add the appropriate number
+         * of pages to the hugetlb pool and free the extras back to the buddy
+         * allocator.
+         */
+        needed += allocated;
+        ret = 0;
+free:
+        list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+                list_del(&page->lru);
+                if ((--needed) >= 0)
+                        enqueue_huge_page(page);
+                else
+                        update_and_free_page(page);
+        }
+
+        return ret;
+}
+
+/*
+ * When releasing a hugetlb pool reservation, any surplus pages that were
+ * allocated to satisfy the reservation must be explicitly freed if they were
+ * never used.
+ */
+void return_unused_surplus_pages(unsigned long unused_resv_pages)
+{
+        static int nid = -1;
+        struct page *page;
+        unsigned long nr_pages;
+
+        nr_pages = min(unused_resv_pages, surplus_huge_pages);
+
+        while (nr_pages) {
+                nid = next_node(nid, node_online_map);
+                if (nid == MAX_NUMNODES)
+                        nid = first_node(node_online_map);
+
+                if (!surplus_huge_pages_node[nid])
+                        continue;
+
+                if (!list_empty(&hugepage_freelists[nid])) {
+                        page = list_entry(hugepage_freelists[nid].next,
+                                          struct page, lru);
+                        list_del(&page->lru);
+                        update_and_free_page(page);
+                        free_huge_pages--;
+                        free_huge_pages_node[nid]--;
+                        surplus_huge_pages--;
+                        surplus_huge_pages_node[nid]--;
+                        nr_pages--;
+                }
+        }
+}
+
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
                                 unsigned long addr)
 {
         struct page *page = NULL;
+        int use_reserved_page = vma->vm_flags & VM_MAYSHARE;
 
         spin_lock(&hugetlb_lock);
-        if (vma->vm_flags & VM_MAYSHARE)
-                resv_huge_pages--;
-        else if (free_huge_pages <= resv_huge_pages)
+        if (!use_reserved_page && (free_huge_pages <= resv_huge_pages))
                 goto fail;
 
         page = dequeue_huge_page(vma, addr);
@@ -234,8 +337,6 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
         return page;
 
 fail:
-        if (vma->vm_flags & VM_MAYSHARE)
-                resv_huge_pages++;
         spin_unlock(&hugetlb_lock);
 
         /*
@@ -243,7 +344,7 @@ fail:
          * may have failed due to an undersized hugetlb pool. Try to grab a
          * surplus huge page from the buddy allocator.
          */
-        if (!(vma->vm_flags & VM_MAYSHARE))
+        if (!use_reserved_page)
                 page = alloc_buddy_huge_page(vma, addr);
 
         return page;
@@ -952,21 +1053,6 @@ static int hugetlb_acct_memory(long delta)
         int ret = -ENOMEM;
 
         spin_lock(&hugetlb_lock);
-        if ((delta + resv_huge_pages) <= free_huge_pages) {
-                resv_huge_pages += delta;
-                ret = 0;
-        }
-        spin_unlock(&hugetlb_lock);
-        return ret;
-}
-
-int hugetlb_reserve_pages(struct inode *inode, long from, long to)
-{
-        long ret, chg;
-
-        chg = region_chg(&inode->i_mapping->private_list, from, to);
-        if (chg < 0)
-                return chg;
         /*
          * When cpuset is configured, it breaks the strict hugetlb page
          * reservation as the accounting is done on a global variable. Such
@@ -984,8 +1070,31 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
          * a best attempt and hopefully to minimize the impact of changing
          * semantics that cpuset has.
          */
-        if (chg > cpuset_mems_nr(free_huge_pages_node))
-                return -ENOMEM;
+        if (delta > 0) {
+                if (gather_surplus_pages(delta) < 0)
+                        goto out;
+
+                if (delta > cpuset_mems_nr(free_huge_pages_node))
+                        goto out;
+        }
+
+        ret = 0;
+        resv_huge_pages += delta;
+        if (delta < 0)
+                return_unused_surplus_pages((unsigned long) -delta);
+
+out:
+        spin_unlock(&hugetlb_lock);
+        return ret;
+}
+
+int hugetlb_reserve_pages(struct inode *inode, long from, long to)
+{
+        long ret, chg;
+
+        chg = region_chg(&inode->i_mapping->private_list, from, to);
+        if (chg < 0)
+                return chg;
 
         ret = hugetlb_acct_memory(chg);
         if (ret < 0)