aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Gibson <david@gibson.dropbear.id.au>2006-03-22 03:08:55 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-03-22 10:54:03 -0500
commitb45b5bd65f668a665db40d093e4e1fe563533608 (patch)
treeaa3806bd87fd7aa719b561e4d468c779f6adb31b
parent3935baa9bcda3ccaee4f7849f5157d316e34412e (diff)
[PATCH] hugepage: Strict page reservation for hugepage inodes
These days, hugepages are demand-allocated at first fault time. There's a somewhat dubious (and racy) heuristic when making a new mmap() to check if there are enough available hugepages to fully satisfy that mapping. A particularly obvious case where the heuristic breaks down is where a process maps its hugepages not as a single chunk, but as a bunch of individually mmap()ed (or shmat()ed) blocks without touching and instantiating the pages in between allocations. In this case the size of each block is compared against the total number of available hugepages. It's thus easy for the process to become overcommitted, because each block mapping will succeed, although the total number of hugepages required by all blocks exceeds the number available. In particular, this defeats a program which would detect a mapping failure and adjust its hugepage usage downward accordingly. The patch below addresses this problem by strictly reserving a number of physical hugepages for hugepage inodes which have been mapped, but not instantiated. MAP_SHARED mappings are thus "safe" - they will fail on mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still trigger an OOM. (Actually SHARED mappings can technically still OOM, but only if the sysadmin explicitly reduces the hugepage pool between mapping and instantiation.) This patch appears to address the problem at hand - it allows DB2 to start correctly, for instance, which previously suffered the failure described above. This patch causes no regressions on the libhugetlbfs testsuite, and makes a test (designed to catch this problem) pass which previously failed (ppc64, POWER5). Signed-off-by: David Gibson <dwg@au1.ibm.com> Cc: William Lee Irwin III <wli@holomorphy.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--fs/hugetlbfs/inode.c74
-rw-r--r--include/linux/hugetlb.h8
-rw-r--r--mm/hugetlb.c136
3 files changed, 154 insertions, 64 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b35195289945..1a1c2fcb7823 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -56,48 +56,10 @@ static void huge_pagevec_release(struct pagevec *pvec)
56 pagevec_reinit(pvec); 56 pagevec_reinit(pvec);
57} 57}
58 58
59/*
60 * huge_pages_needed tries to determine the number of new huge pages that
61 * will be required to fully populate this VMA. This will be equal to
62 * the size of the VMA in huge pages minus the number of huge pages
63 * (covered by this VMA) that are found in the page cache.
64 *
65 * Result is in bytes to be compatible with is_hugepage_mem_enough()
66 */
67static unsigned long
68huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
69{
70 int i;
71 struct pagevec pvec;
72 unsigned long start = vma->vm_start;
73 unsigned long end = vma->vm_end;
74 unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
75 pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
76 pgoff_t endpg = next + hugepages;
77
78 pagevec_init(&pvec, 0);
79 while (next < endpg) {
80 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
81 break;
82 for (i = 0; i < pagevec_count(&pvec); i++) {
83 struct page *page = pvec.pages[i];
84 if (page->index > next)
85 next = page->index;
86 if (page->index >= endpg)
87 break;
88 next++;
89 hugepages--;
90 }
91 huge_pagevec_release(&pvec);
92 }
93 return hugepages << HPAGE_SHIFT;
94}
95
96static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) 59static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
97{ 60{
98 struct inode *inode = file->f_dentry->d_inode; 61 struct inode *inode = file->f_dentry->d_inode;
99 struct address_space *mapping = inode->i_mapping; 62 struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
100 unsigned long bytes;
101 loff_t len, vma_len; 63 loff_t len, vma_len;
102 int ret; 64 int ret;
103 65
@@ -113,10 +75,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
113 if (vma->vm_end - vma->vm_start < HPAGE_SIZE) 75 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
114 return -EINVAL; 76 return -EINVAL;
115 77
116 bytes = huge_pages_needed(mapping, vma);
117 if (!is_hugepage_mem_enough(bytes))
118 return -ENOMEM;
119
120 vma_len = (loff_t)(vma->vm_end - vma->vm_start); 78 vma_len = (loff_t)(vma->vm_end - vma->vm_start);
121 79
122 mutex_lock(&inode->i_mutex); 80 mutex_lock(&inode->i_mutex);
@@ -129,6 +87,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
129 if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) 87 if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
130 goto out; 88 goto out;
131 89
90 if (vma->vm_flags & VM_MAYSHARE)
91 if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0)
92 goto out;
93
132 ret = 0; 94 ret = 0;
133 hugetlb_prefault_arch_hook(vma->vm_mm); 95 hugetlb_prefault_arch_hook(vma->vm_mm);
134 if (inode->i_size < len) 96 if (inode->i_size < len)
@@ -227,13 +189,18 @@ static void truncate_huge_page(struct page *page)
227 put_page(page); 189 put_page(page);
228} 190}
229 191
230static void truncate_hugepages(struct address_space *mapping, loff_t lstart) 192static void truncate_hugepages(struct inode *inode, loff_t lstart)
231{ 193{
194 struct address_space *mapping = &inode->i_data;
232 const pgoff_t start = lstart >> HPAGE_SHIFT; 195 const pgoff_t start = lstart >> HPAGE_SHIFT;
233 struct pagevec pvec; 196 struct pagevec pvec;
234 pgoff_t next; 197 pgoff_t next;
235 int i; 198 int i;
236 199
200 hugetlb_truncate_reservation(HUGETLBFS_I(inode),
201 lstart >> HPAGE_SHIFT);
202 if (!mapping->nrpages)
203 return;
237 pagevec_init(&pvec, 0); 204 pagevec_init(&pvec, 0);
238 next = start; 205 next = start;
239 while (1) { 206 while (1) {
@@ -262,8 +229,7 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
262 229
263static void hugetlbfs_delete_inode(struct inode *inode) 230static void hugetlbfs_delete_inode(struct inode *inode)
264{ 231{
265 if (inode->i_data.nrpages) 232 truncate_hugepages(inode, 0);
266 truncate_hugepages(&inode->i_data, 0);
267 clear_inode(inode); 233 clear_inode(inode);
268} 234}
269 235
@@ -296,8 +262,7 @@ static void hugetlbfs_forget_inode(struct inode *inode)
296 inode->i_state |= I_FREEING; 262 inode->i_state |= I_FREEING;
297 inodes_stat.nr_inodes--; 263 inodes_stat.nr_inodes--;
298 spin_unlock(&inode_lock); 264 spin_unlock(&inode_lock);
299 if (inode->i_data.nrpages) 265 truncate_hugepages(inode, 0);
300 truncate_hugepages(&inode->i_data, 0);
301 clear_inode(inode); 266 clear_inode(inode);
302 destroy_inode(inode); 267 destroy_inode(inode);
303} 268}
@@ -356,7 +321,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
356 if (!prio_tree_empty(&mapping->i_mmap)) 321 if (!prio_tree_empty(&mapping->i_mmap))
357 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); 322 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
358 spin_unlock(&mapping->i_mmap_lock); 323 spin_unlock(&mapping->i_mmap_lock);
359 truncate_hugepages(mapping, offset); 324 truncate_hugepages(inode, offset);
360 return 0; 325 return 0;
361} 326}
362 327
@@ -573,6 +538,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
573 hugetlbfs_inc_free_inodes(sbinfo); 538 hugetlbfs_inc_free_inodes(sbinfo);
574 return NULL; 539 return NULL;
575 } 540 }
541 p->prereserved_hpages = 0;
576 return &p->vfs_inode; 542 return &p->vfs_inode;
577} 543}
578 544
@@ -805,9 +771,6 @@ struct file *hugetlb_zero_setup(size_t size)
805 if (!can_do_hugetlb_shm()) 771 if (!can_do_hugetlb_shm())
806 return ERR_PTR(-EPERM); 772 return ERR_PTR(-EPERM);
807 773
808 if (!is_hugepage_mem_enough(size))
809 return ERR_PTR(-ENOMEM);
810
811 if (!user_shm_lock(size, current->user)) 774 if (!user_shm_lock(size, current->user))
812 return ERR_PTR(-ENOMEM); 775 return ERR_PTR(-ENOMEM);
813 776
@@ -831,6 +794,11 @@ struct file *hugetlb_zero_setup(size_t size)
831 if (!inode) 794 if (!inode)
832 goto out_file; 795 goto out_file;
833 796
797 error = -ENOMEM;
798 if (hugetlb_extend_reservation(HUGETLBFS_I(inode),
799 size >> HPAGE_SHIFT) != 0)
800 goto out_inode;
801
834 d_instantiate(dentry, inode); 802 d_instantiate(dentry, inode);
835 inode->i_size = size; 803 inode->i_size = size;
836 inode->i_nlink = 0; 804 inode->i_nlink = 0;
@@ -841,6 +809,8 @@ struct file *hugetlb_zero_setup(size_t size)
841 file->f_mode = FMODE_WRITE | FMODE_READ; 809 file->f_mode = FMODE_WRITE | FMODE_READ;
842 return file; 810 return file;
843 811
812out_inode:
813 iput(inode);
844out_file: 814out_file:
845 put_filp(file); 815 put_filp(file);
846out_dentry: 816out_dentry:
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index fa83836b63d2..cafe73eecb05 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -20,7 +20,6 @@ void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long)
20int hugetlb_prefault(struct address_space *, struct vm_area_struct *); 20int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
21int hugetlb_report_meminfo(char *); 21int hugetlb_report_meminfo(char *);
22int hugetlb_report_node_meminfo(int, char *); 22int hugetlb_report_node_meminfo(int, char *);
23int is_hugepage_mem_enough(size_t);
24unsigned long hugetlb_total_pages(void); 23unsigned long hugetlb_total_pages(void);
25struct page *alloc_huge_page(struct vm_area_struct *, unsigned long); 24struct page *alloc_huge_page(struct vm_area_struct *, unsigned long);
26void free_huge_page(struct page *); 25void free_huge_page(struct page *);
@@ -89,7 +88,6 @@ static inline unsigned long hugetlb_total_pages(void)
89#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) 88#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
90#define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) 89#define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
91#define unmap_hugepage_range(vma, start, end) BUG() 90#define unmap_hugepage_range(vma, start, end) BUG()
92#define is_hugepage_mem_enough(size) 0
93#define hugetlb_report_meminfo(buf) 0 91#define hugetlb_report_meminfo(buf) 0
94#define hugetlb_report_node_meminfo(n, buf) 0 92#define hugetlb_report_node_meminfo(n, buf) 0
95#define follow_huge_pmd(mm, addr, pmd, write) NULL 93#define follow_huge_pmd(mm, addr, pmd, write) NULL
@@ -132,6 +130,8 @@ struct hugetlbfs_sb_info {
132 130
133struct hugetlbfs_inode_info { 131struct hugetlbfs_inode_info {
134 struct shared_policy policy; 132 struct shared_policy policy;
133 /* Protected by the (global) hugetlb_lock */
134 unsigned long prereserved_hpages;
135 struct inode vfs_inode; 135 struct inode vfs_inode;
136}; 136};
137 137
@@ -148,6 +148,10 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
148extern struct file_operations hugetlbfs_file_operations; 148extern struct file_operations hugetlbfs_file_operations;
149extern struct vm_operations_struct hugetlb_vm_ops; 149extern struct vm_operations_struct hugetlb_vm_ops;
150struct file *hugetlb_zero_setup(size_t); 150struct file *hugetlb_zero_setup(size_t);
151int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
152 unsigned long atleast_hpages);
153void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
154 unsigned long atmost_hpages);
151int hugetlb_get_quota(struct address_space *mapping); 155int hugetlb_get_quota(struct address_space *mapping);
152void hugetlb_put_quota(struct address_space *mapping); 156void hugetlb_put_quota(struct address_space *mapping);
153 157
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d5987a87bbe5..27fad5d9bcf6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
22#include "internal.h" 22#include "internal.h"
23 23
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages; 25static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
26unsigned long max_huge_pages; 26unsigned long max_huge_pages;
27static struct list_head hugepage_freelists[MAX_NUMNODES]; 27static struct list_head hugepage_freelists[MAX_NUMNODES];
28static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 28static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -120,17 +120,136 @@ void free_huge_page(struct page *page)
120 120
121struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) 121struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
122{ 122{
123 struct inode *inode = vma->vm_file->f_dentry->d_inode;
123 struct page *page; 124 struct page *page;
125 int use_reserve = 0;
126 unsigned long idx;
124 127
125 spin_lock(&hugetlb_lock); 128 spin_lock(&hugetlb_lock);
126 page = dequeue_huge_page(vma, addr); 129
127 if (!page) { 130 if (vma->vm_flags & VM_MAYSHARE) {
128 spin_unlock(&hugetlb_lock); 131
129 return NULL; 132 /* idx = radix tree index, i.e. offset into file in
133 * HPAGE_SIZE units */
134 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
135 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
136
137 /* The hugetlbfs specific inode info stores the number
138 * of "guaranteed available" (huge) pages. That is,
139 * the first 'prereserved_hpages' pages of the inode
140 * are either already instantiated, or have been
141 * pre-reserved (by hugetlb_reserve_for_inode()). Here
142 * we're in the process of instantiating the page, so
143 * we use this to determine whether to draw from the
144 * pre-reserved pool or the truly free pool. */
145 if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
146 use_reserve = 1;
147 }
148
149 if (!use_reserve) {
150 if (free_huge_pages <= reserved_huge_pages)
151 goto fail;
152 } else {
153 BUG_ON(reserved_huge_pages == 0);
154 reserved_huge_pages--;
130 } 155 }
156
157 page = dequeue_huge_page(vma, addr);
158 if (!page)
159 goto fail;
160
131 spin_unlock(&hugetlb_lock); 161 spin_unlock(&hugetlb_lock);
132 set_page_refcounted(page); 162 set_page_refcounted(page);
133 return page; 163 return page;
164
165 fail:
166 WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
167 spin_unlock(&hugetlb_lock);
168 return NULL;
169}
170
171/* hugetlb_extend_reservation()
172 *
173 * Ensure that at least 'atleast' hugepages are, and will remain,
174 * available to instantiate the first 'atleast' pages of the given
175 * inode. If the inode doesn't already have this many pages reserved
176 * or instantiated, set aside some hugepages in the reserved pool to
177 * satisfy later faults (or fail now if there aren't enough, rather
178 * than getting the SIGBUS later).
179 */
180int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
181 unsigned long atleast)
182{
183 struct inode *inode = &info->vfs_inode;
184 unsigned long change_in_reserve = 0;
185 int ret = 0;
186
187 spin_lock(&hugetlb_lock);
188 read_lock_irq(&inode->i_mapping->tree_lock);
189
190 if (info->prereserved_hpages >= atleast)
191 goto out;
192
193 /* Because we always call this on shared mappings, none of the
194 * pages beyond info->prereserved_hpages can have been
195 * instantiated, so we need to reserve all of them now. */
196 change_in_reserve = atleast - info->prereserved_hpages;
197
198 if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
199 ret = -ENOMEM;
200 goto out;
201 }
202
203 reserved_huge_pages += change_in_reserve;
204 info->prereserved_hpages = atleast;
205
206 out:
207 read_unlock_irq(&inode->i_mapping->tree_lock);
208 spin_unlock(&hugetlb_lock);
209
210 return ret;
211}
212
213/* hugetlb_truncate_reservation()
214 *
215 * This returns pages reserved for the given inode to the general free
216 * hugepage pool. If the inode has any pages prereserved, but not
217 * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
218 * them.
219 */
220void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
221 unsigned long atmost)
222{
223 struct inode *inode = &info->vfs_inode;
224 struct address_space *mapping = inode->i_mapping;
225 unsigned long idx;
226 unsigned long change_in_reserve = 0;
227 struct page *page;
228
229 spin_lock(&hugetlb_lock);
230 read_lock_irq(&inode->i_mapping->tree_lock);
231
232 if (info->prereserved_hpages <= atmost)
233 goto out;
234
235 /* Count pages which were reserved, but not instantiated, and
236 * which we can now release. */
237 for (idx = atmost; idx < info->prereserved_hpages; idx++) {
238 page = radix_tree_lookup(&mapping->page_tree, idx);
239 if (!page)
240 /* Pages which are already instantiated can't
241 * be unreserved (and in fact have already
242 * been removed from the reserved pool) */
243 change_in_reserve++;
244 }
245
246 BUG_ON(reserved_huge_pages < change_in_reserve);
247 reserved_huge_pages -= change_in_reserve;
248 info->prereserved_hpages = atmost;
249
250 out:
251 read_unlock_irq(&inode->i_mapping->tree_lock);
252 spin_unlock(&hugetlb_lock);
134} 253}
135 254
136static int __init hugetlb_init(void) 255static int __init hugetlb_init(void)
@@ -238,9 +357,11 @@ int hugetlb_report_meminfo(char *buf)
238 return sprintf(buf, 357 return sprintf(buf,
239 "HugePages_Total: %5lu\n" 358 "HugePages_Total: %5lu\n"
240 "HugePages_Free: %5lu\n" 359 "HugePages_Free: %5lu\n"
360 "HugePages_Rsvd: %5lu\n"
241 "Hugepagesize: %5lu kB\n", 361 "Hugepagesize: %5lu kB\n",
242 nr_huge_pages, 362 nr_huge_pages,
243 free_huge_pages, 363 free_huge_pages,
364 reserved_huge_pages,
244 HPAGE_SIZE/1024); 365 HPAGE_SIZE/1024);
245} 366}
246 367
@@ -253,11 +374,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
253 nid, free_huge_pages_node[nid]); 374 nid, free_huge_pages_node[nid]);
254} 375}
255 376
256int is_hugepage_mem_enough(size_t size)
257{
258 return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
259}
260
261/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 377/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
262unsigned long hugetlb_total_pages(void) 378unsigned long hugetlb_total_pages(void)
263{ 379{