path: root/fs/hugetlbfs
author    David Gibson <david@gibson.dropbear.id.au>  2006-03-22 03:08:55 -0500
committer Linus Torvalds <torvalds@g5.osdl.org>  2006-03-22 10:54:03 -0500
commit    b45b5bd65f668a665db40d093e4e1fe563533608 (patch)
tree      aa3806bd87fd7aa719b561e4d468c779f6adb31b /fs/hugetlbfs
parent    3935baa9bcda3ccaee4f7849f5157d316e34412e (diff)
[PATCH] hugepage: Strict page reservation for hugepage inodes
These days, hugepages are demand-allocated at first fault time. There's a somewhat dubious (and racy) heuristic when making a new mmap() to check if there are enough available hugepages to fully satisfy that mapping.

A particularly obvious case where the heuristic breaks down is where a process maps its hugepages not as a single chunk, but as a bunch of individually mmap()ed (or shmat()ed) blocks without touching and instantiating the pages in between allocations. In this case the size of each block is compared against the total number of available hugepages. It's thus easy for the process to become overcommitted, because each block mapping will succeed, although the total number of hugepages required by all blocks exceeds the number available. In particular, this defeats a program which detects a mapping failure and adjusts its hugepage usage downward accordingly.

The patch below addresses this problem by strictly reserving a number of physical hugepages for hugepage inodes which have been mapped, but not instantiated. MAP_SHARED mappings are thus "safe" - they will fail on mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still trigger an OOM. (Actually SHARED mappings can technically still OOM, but only if the sysadmin explicitly reduces the hugepage pool between mapping and instantiation.)

This patch appears to address the problem at hand - it allows DB2 to start correctly, for instance, which previously suffered the failure described above. This patch causes no regressions on the libhugetlbfs testsuite, and makes a test (designed to catch this problem) pass which previously failed (ppc64, POWER5).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
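The reservation primitives called in the diff below, hugetlb_extend_reservation() and hugetlb_truncate_reservation(), are presumably added in mm/hugetlb.c by the same commit and so fall outside this diffstat-limited view. As a minimal sketch of the bookkeeping such a primitive could perform: the counter name reserved_huge_pages and the extern declarations are assumptions for illustration, while hugetlb_lock, free_huge_pages, and info->prereserved_hpages follow names visible in this era of the kernel and in this diff.

    /* Assumed externs from mm/hugetlb.c; a sketch, not the patch's code. */
    extern spinlock_t hugetlb_lock;
    extern unsigned long free_huge_pages;

    static unsigned long reserved_huge_pages; /* promised but not yet faulted */

    int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
                                   unsigned long npages)
    {
            int ret = 0;

            spin_lock(&hugetlb_lock);
            if (npages > info->prereserved_hpages) {
                    unsigned long extra = npages - info->prereserved_hpages;

                    /* Fail at mmap() time unless the pool can cover the
                     * extra pages this inode would now need. */
                    if (reserved_huge_pages + extra > free_huge_pages)
                            ret = -ENOMEM;
                    else {
                            reserved_huge_pages += extra;
                            info->prereserved_hpages = npages;
                    }
            }
            spin_unlock(&hugetlb_lock);
            return ret;
    }

hugetlb_truncate_reservation() would do the inverse, returning any reservation beyond the truncated size to the pool.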
Diffstat (limited to 'fs/hugetlbfs')
-rw-r--r--  fs/hugetlbfs/inode.c  74
1 file changed, 22 insertions(+), 52 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b35195289945..1a1c2fcb7823 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -56,48 +56,10 @@ static void huge_pagevec_release(struct pagevec *pvec)
 	pagevec_reinit(pvec);
 }
 
-/*
- * huge_pages_needed tries to determine the number of new huge pages that
- * will be required to fully populate this VMA. This will be equal to
- * the size of the VMA in huge pages minus the number of huge pages
- * (covered by this VMA) that are found in the page cache.
- *
- * Result is in bytes to be compatible with is_hugepage_mem_enough()
- */
-static unsigned long
-huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
-{
-	int i;
-	struct pagevec pvec;
-	unsigned long start = vma->vm_start;
-	unsigned long end = vma->vm_end;
-	unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
-	pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
-	pgoff_t endpg = next + hugepages;
-
-	pagevec_init(&pvec, 0);
-	while (next < endpg) {
-		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
-			break;
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			struct page *page = pvec.pages[i];
-			if (page->index > next)
-				next = page->index;
-			if (page->index >= endpg)
-				break;
-			next++;
-			hugepages--;
-		}
-		huge_pagevec_release(&pvec);
-	}
-	return hugepages << HPAGE_SHIFT;
-}
-
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct address_space *mapping = inode->i_mapping;
-	unsigned long bytes;
+	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 	loff_t len, vma_len;
 	int ret;
 
@@ -113,10 +75,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
 		return -EINVAL;
 
-	bytes = huge_pages_needed(mapping, vma);
-	if (!is_hugepage_mem_enough(bytes))
-		return -ENOMEM;
-
 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
 
 	mutex_lock(&inode->i_mutex);
@@ -129,6 +87,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
 		goto out;
 
+	if (vma->vm_flags & VM_MAYSHARE)
+		if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0)
+			goto out;
+
 	ret = 0;
 	hugetlb_prefault_arch_hook(vma->vm_mm);
 	if (inode->i_size < len)
@@ -227,13 +189,18 @@ static void truncate_huge_page(struct page *page)
 	put_page(page);
 }
 
-static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
+static void truncate_hugepages(struct inode *inode, loff_t lstart)
 {
+	struct address_space *mapping = &inode->i_data;
 	const pgoff_t start = lstart >> HPAGE_SHIFT;
 	struct pagevec pvec;
 	pgoff_t next;
 	int i;
 
+	hugetlb_truncate_reservation(HUGETLBFS_I(inode),
+				     lstart >> HPAGE_SHIFT);
+	if (!mapping->nrpages)
+		return;
 	pagevec_init(&pvec, 0);
 	next = start;
 	while (1) {
@@ -262,8 +229,7 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
 
 static void hugetlbfs_delete_inode(struct inode *inode)
 {
-	if (inode->i_data.nrpages)
-		truncate_hugepages(&inode->i_data, 0);
+	truncate_hugepages(inode, 0);
 	clear_inode(inode);
 }
 
@@ -296,8 +262,7 @@ static void hugetlbfs_forget_inode(struct inode *inode)
 	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
-	if (inode->i_data.nrpages)
-		truncate_hugepages(&inode->i_data, 0);
+	truncate_hugepages(inode, 0);
 	clear_inode(inode);
 	destroy_inode(inode);
 }
@@ -356,7 +321,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 	if (!prio_tree_empty(&mapping->i_mmap))
 		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
 	spin_unlock(&mapping->i_mmap_lock);
-	truncate_hugepages(mapping, offset);
+	truncate_hugepages(inode, offset);
 	return 0;
 }
 
@@ -573,6 +538,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 		hugetlbfs_inc_free_inodes(sbinfo);
 		return NULL;
 	}
+	p->prereserved_hpages = 0;
 	return &p->vfs_inode;
 }
 
@@ -805,9 +771,6 @@ struct file *hugetlb_zero_setup(size_t size)
 	if (!can_do_hugetlb_shm())
 		return ERR_PTR(-EPERM);
 
-	if (!is_hugepage_mem_enough(size))
-		return ERR_PTR(-ENOMEM);
-
 	if (!user_shm_lock(size, current->user))
 		return ERR_PTR(-ENOMEM);
 
@@ -831,6 +794,11 @@ struct file *hugetlb_zero_setup(size_t size)
 	if (!inode)
 		goto out_file;
 
+	error = -ENOMEM;
+	if (hugetlb_extend_reservation(HUGETLBFS_I(inode),
+				       size >> HPAGE_SHIFT) != 0)
+		goto out_inode;
+
 	d_instantiate(dentry, inode);
 	inode->i_size = size;
 	inode->i_nlink = 0;
@@ -841,6 +809,8 @@ struct file *hugetlb_zero_setup(size_t size)
 	file->f_mode = FMODE_WRITE | FMODE_READ;
 	return file;
 
+out_inode:
+	iput(inode);
 out_file:
 	put_filp(file);
 out_dentry:
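
For illustration, a minimal userspace reproducer of the overcommit pattern the commit message describes. The mount point /mnt/huge, the file name, and the 16MB hugepage size are assumptions for the example (ppc64 POWER5 uses 16MB hugepages); adjust for the system at hand. Before this patch each one-hugepage mmap() could succeed individually and the process would be OOM-killed at fault time once the pool ran dry; with strict reservation, the mmap() that overcommits the pool fails cleanly with ENOMEM.

    /* Map NBLOCKS separate hugepage-sized blocks from one hugetlbfs file
     * without touching them, then fault them all in. */
    #include <stdio.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/mman.h>

    #define HPAGE_SIZE (16UL * 1024 * 1024)  /* assumed 16MB hugepages */
    #define NBLOCKS    8

    int main(void)
    {
            int fd = open("/mnt/huge/testfile", O_CREAT | O_RDWR, 0600);
            void *blocks[NBLOCKS];
            int i;

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            for (i = 0; i < NBLOCKS; i++) {
                    /* Each block maps one hugepage at its own file offset. */
                    blocks[i] = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
                                     MAP_SHARED, fd, (off_t)i * HPAGE_SIZE);
                    if (blocks[i] == MAP_FAILED) {
                            /* With strict reservation, an overcommitted
                             * mapping now fails here with ENOMEM. */
                            perror("mmap");
                            return 1;
                    }
            }
            /* Only now are the pages instantiated; previously an
             * overcommit would trigger the OOM SIGKILL here instead. */
            for (i = 0; i < NBLOCKS; i++)
                    memset(blocks[i], 0, HPAGE_SIZE);
            printf("all %d blocks instantiated\n", NBLOCKS);
            close(fd);
            return 0;
    }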