author	Mel Gorman <mel@csn.ul.ie>	2008-07-24 00:27:23 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-07-24 13:47:16 -0400
commit	a1e78772d72b2616ed20e54896e68e0e7044854e (patch)
tree	d752dd96c2a4fcc555779a7aa99f95069c9b95ae
parent	fc1b8a73dd71226902a11928dd5500326e101df9 (diff)
hugetlb: reserve huge pages for reliable MAP_PRIVATE hugetlbfs mappings until fork()
This patch reserves huge pages at mmap() time for MAP_PRIVATE mappings in a
similar manner to the reservations taken for MAP_SHARED mappings. The reserve
count is accounted both globally and on a per-VMA basis for private mappings.
This guarantees that a process that successfully calls mmap() will
successfully fault all pages in the future unless fork() is called.

The characteristics of private mappings of hugetlbfs files after this patch
are:

1. The process calling mmap() is guaranteed to succeed all future faults
   until it calls fork().

2. On fork(), the parent may die due to SIGKILL on writes to the private
   mapping if enough pages are not available for the COW. For reasonably
   reliable behaviour in the face of a small huge page pool, children of
   hugepage-aware processes should not reference the mappings, such as might
   occur when fork()ing to exec().

3. On fork(), the child VMAs inherit no reserves. Reads on pages already
   faulted by the parent will succeed. Successful writes will depend on
   enough huge pages being free in the pool.

4. Quotas of the hugetlbfs mount are checked at reserve time for the mapper
   and at fault time otherwise.

Before this patch, all reads or writes in the child potentially need page
allocations that can later lead to the death of the parent. This applies to
reads and writes of uninstantiated pages as well as COW. After the patch it
is only a write to an instantiated page that causes problems.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Adam Litke <agl@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
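For illustration only (not part of the patch), a minimal userspace sketch of the
semantics described above. It assumes a hugetlbfs mount at /dev/hugepages, a 2MB
default huge page size, and at least four free huge pages in the pool; the file
name "demo" is made up for the example.

/* build: gcc -O2 -o hugetlb-demo hugetlb-demo.c */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#define HUGEPAGE_SIZE	(2UL * 1024 * 1024)	/* assumed 2MB huge pages */
#define NR_HUGEPAGES	4			/* assumed free in the pool */

int main(void)
{
	size_t len = NR_HUGEPAGES * HUGEPAGE_SIZE;
	const char *path = "/dev/hugepages/demo";	/* assumed hugetlbfs mount */
	char *addr;
	pid_t pid;
	int fd;

	fd = open(path, O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/*
	 * With this patch the reservation for all NR_HUGEPAGES pages is taken
	 * here, at mmap() time, so a shortage of huge pages shows up as an
	 * mmap() failure instead of a fatal fault later on.
	 */
	addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (addr == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}

	/* The process that called mmap() can fault every page reliably. */
	memset(addr, 1, len);

	pid = fork();
	if (pid == 0) {
		/*
		 * The child inherits no reserves: reading pages the parent
		 * already faulted is fine, but this COW write only succeeds
		 * if free huge pages remain in the pool.
		 */
		addr[0] = 2;
		_exit(0);
	}
	if (pid > 0)
		waitpid(pid, NULL, 0);

	munmap(addr, len);
	close(fd);
	unlink(path);
	return 0;
}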
-rw-r--r--	fs/hugetlbfs/inode.c	8
-rw-r--r--	include/linux/hugetlb.h	9
-rw-r--r--	kernel/fork.c	9
-rw-r--r--	mm/hugetlb.c	158
4 files changed, 140 insertions, 44 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index aeabf80f81a5..1576bbecd084 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -103,9 +103,9 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	ret = -ENOMEM;
 	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-	if (vma->vm_flags & VM_MAYSHARE &&
-	    hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
-				  len >> HPAGE_SHIFT))
+	if (hugetlb_reserve_pages(inode,
+			vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
+			len >> HPAGE_SHIFT, vma))
 		goto out;
 
 	ret = 0;
@@ -942,7 +942,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
 		goto out_dentry;
 
 	error = -ENOMEM;
-	if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
+	if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT, NULL))
 		goto out_inode;
 
 	d_instantiate(dentry, inode);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index a79e80b689d8..185b14c9f021 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -17,6 +17,7 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 	return vma->vm_flags & VM_HUGETLB;
 }
 
+void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
 int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int hugetlb_overcommit_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
@@ -30,7 +31,8 @@ int hugetlb_report_node_meminfo(int, char *);
 unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
-int hugetlb_reserve_pages(struct inode *inode, long from, long to);
+int hugetlb_reserve_pages(struct inode *inode, long from, long to,
+						struct vm_area_struct *vma);
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 
 extern unsigned long max_huge_pages;
@@ -58,6 +60,11 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
 	return 0;
 }
+
+static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+}
+
 static inline unsigned long hugetlb_total_pages(void)
 {
 	return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index adefc1131f27..552c8d8e77ad 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -33,6 +33,7 @@
 #include <linux/cpu.h>
 #include <linux/cgroup.h>
 #include <linux/security.h>
+#include <linux/hugetlb.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -307,6 +308,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		}
 
 		/*
+		 * Clear hugetlb-related page reserves for children. This only
+		 * affects MAP_PRIVATE mappings. Faults generated by the child
+		 * are not guaranteed to succeed, even if read-only
+		 */
+		if (is_vm_hugetlb_page(tmp))
+			reset_vma_resv_huge_pages(tmp);
+
+		/*
 		 * Link in the new vma and copy the page table entries.
 		 */
 		*pprev = tmp;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a4dbba8965f3..0af500db3632 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -40,6 +40,69 @@ static int hugetlb_next_nid;
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+/*
+ * These helpers are used to track how many pages are reserved for
+ * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
+ * is guaranteed to have their future faults succeed.
+ *
+ * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * the reserve counters are updated with the hugetlb_lock held. It is safe
+ * to reset the VMA at fork() time as it is not in use yet and there is no
+ * chance of the global counters getting corrupted as a result of the values.
+ */
+static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+	if (!(vma->vm_flags & VM_SHARED))
+		return (unsigned long)vma->vm_private_data;
+	return 0;
+}
+
+static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
+						unsigned long reserve)
+{
+	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+	VM_BUG_ON(vma->vm_flags & VM_SHARED);
+
+	vma->vm_private_data = (void *)reserve;
+}
+
+/* Decrement the reserved pages in the hugepage pool by one */
+static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & VM_SHARED) {
+		/* Shared mappings always use reserves */
+		resv_huge_pages--;
+	} else {
+		/*
+		 * Only the process that called mmap() has reserves for
+		 * private mappings.
+		 */
+		if (vma_resv_huge_pages(vma)) {
+			resv_huge_pages--;
+			set_vma_resv_huge_pages(vma,
+					vma_resv_huge_pages(vma) - 1);
+		}
+	}
+}
+
+void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+	if (!(vma->vm_flags & VM_SHARED))
+		vma->vm_private_data = (void *)0;
+}
+
+/* Returns true if the VMA has associated reserve pages */
+static int vma_has_private_reserves(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & VM_SHARED)
+		return 0;
+	if (!vma_resv_huge_pages(vma))
+		return 0;
+	return 1;
+}
+
 static void clear_huge_page(struct page *page, unsigned long addr)
 {
 	int i;
@@ -101,6 +164,15 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 	struct zone *zone;
 	struct zoneref *z;
 
+	/*
+	 * A child process with MAP_PRIVATE mappings created by its parent
+	 * has no page reserves. This check ensures that reservations are
+	 * not "stolen". The child may still get SIGKILLed
+	 */
+	if (!vma_has_private_reserves(vma) &&
+			free_huge_pages - resv_huge_pages == 0)
+		return NULL;
+
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						MAX_NR_ZONES - 1, nodemask) {
 		nid = zone_to_nid(zone);
@@ -111,8 +183,8 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 			list_del(&page->lru);
 			free_huge_pages--;
 			free_huge_pages_node[nid]--;
-			if (vma && vma->vm_flags & VM_MAYSHARE)
-				resv_huge_pages--;
+			decrement_hugepage_resv_vma(vma);
+
 			break;
 		}
 	}
@@ -461,55 +533,40 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
 	}
 }
 
-
-static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
-						unsigned long addr)
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+				    unsigned long addr)
 {
 	struct page *page;
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct inode *inode = mapping->host;
+	unsigned int chg = 0;
+
+	/*
+	 * Processes that did not create the mapping will have no reserves and
+	 * will not have accounted against quota. Check that the quota can be
+	 * made before satisfying the allocation
+	 */
+	if (!vma_has_private_reserves(vma)) {
+		chg = 1;
+		if (hugetlb_get_quota(inode->i_mapping, chg))
+			return ERR_PTR(-ENOSPC);
+	}
 
 	spin_lock(&hugetlb_lock);
 	page = dequeue_huge_page_vma(vma, addr);
 	spin_unlock(&hugetlb_lock);
-	return page ? page : ERR_PTR(-VM_FAULT_OOM);
-}
 
-static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
-						unsigned long addr)
-{
-	struct page *page = NULL;
-
-	if (hugetlb_get_quota(vma->vm_file->f_mapping, 1))
-		return ERR_PTR(-VM_FAULT_SIGBUS);
-
-	spin_lock(&hugetlb_lock);
-	if (free_huge_pages > resv_huge_pages)
-		page = dequeue_huge_page_vma(vma, addr);
-	spin_unlock(&hugetlb_lock);
 	if (!page) {
 		page = alloc_buddy_huge_page(vma, addr);
 		if (!page) {
-			hugetlb_put_quota(vma->vm_file->f_mapping, 1);
+			hugetlb_put_quota(inode->i_mapping, chg);
 			return ERR_PTR(-VM_FAULT_OOM);
 		}
 	}
-	return page;
-}
 
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
-				    unsigned long addr)
-{
-	struct page *page;
-	struct address_space *mapping = vma->vm_file->f_mapping;
-
-	if (vma->vm_flags & VM_MAYSHARE)
-		page = alloc_huge_page_shared(vma, addr);
-	else
-		page = alloc_huge_page_private(vma, addr);
+	set_page_refcounted(page);
+	set_page_private(page, (unsigned long) mapping);
 
-	if (!IS_ERR(page)) {
-		set_page_refcounted(page);
-		set_page_private(page, (unsigned long) mapping);
-	}
 	return page;
 }
 
@@ -757,6 +814,13 @@ out:
 	return ret;
 }
 
+static void hugetlb_vm_op_close(struct vm_area_struct *vma)
+{
+	unsigned long reserve = vma_resv_huge_pages(vma);
+	if (reserve)
+		hugetlb_acct_memory(-reserve);
+}
+
 /*
  * We cannot handle pagefaults against hugetlb pages at all. They cause
  * handle_mm_fault() to try to instantiate regular-sized pages in the
@@ -771,6 +835,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 struct vm_operations_struct hugetlb_vm_ops = {
 	.fault = hugetlb_vm_op_fault,
+	.close = hugetlb_vm_op_close,
 };
 
 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -1289,11 +1354,25 @@ static long region_truncate(struct list_head *head, long end)
 	return chg;
 }
 
-int hugetlb_reserve_pages(struct inode *inode, long from, long to)
+int hugetlb_reserve_pages(struct inode *inode,
+					long from, long to,
+					struct vm_area_struct *vma)
 {
 	long ret, chg;
 
-	chg = region_chg(&inode->i_mapping->private_list, from, to);
+	/*
+	 * Shared mappings base their reservation on the number of pages that
+	 * are already allocated on behalf of the file. Private mappings need
+	 * to reserve the full area even if read-only as mprotect() may be
+	 * called to make the mapping read-write. Assume !vma is a shm mapping
+	 */
+	if (!vma || vma->vm_flags & VM_SHARED)
+		chg = region_chg(&inode->i_mapping->private_list, from, to);
+	else {
+		chg = to - from;
+		set_vma_resv_huge_pages(vma, chg);
+	}
+
 	if (chg < 0)
 		return chg;
 
@@ -1304,7 +1383,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
 		hugetlb_put_quota(inode->i_mapping, chg);
 		return ret;
 	}
-	region_add(&inode->i_mapping->private_list, from, to);
+	if (!vma || vma->vm_flags & VM_SHARED)
+		region_add(&inode->i_mapping->private_list, from, to);
 	return 0;
 }
 