aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorDavid Gibson <david@gibson.dropbear.id.au>2006-03-22 03:08:55 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-03-22 10:54:03 -0500
commitb45b5bd65f668a665db40d093e4e1fe563533608 (patch)
treeaa3806bd87fd7aa719b561e4d468c779f6adb31b /mm
parent3935baa9bcda3ccaee4f7849f5157d316e34412e (diff)
[PATCH] hugepage: Strict page reservation for hugepage inodes
These days, hugepages are demand-allocated at first fault time. There's a somewhat dubious (and racy) heuristic when making a new mmap() to check if there are enough available hugepages to fully satisfy that mapping. A particularly obvious case where the heuristic breaks down is where a process maps its hugepages not as a single chunk, but as a bunch of individually mmap()ed (or shmat()ed) blocks without touching and instantiating the pages in between allocations. In this case the size of each block is compared against the total number of available hugepages. It's thus easy for the process to become overcommitted, because each block mapping will succeed, although the total number of hugepages required by all blocks exceeds the number available. In particular, this defeats such a program which will detect a mapping failure and adjust its hugepage usage downward accordingly. The patch below addresses this problem, by strictly reserving a number of physical hugepages for hugepage inodes which have been mapped, but not instantiated. MAP_SHARED mappings are thus "safe" - they will fail on mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still trigger an OOM. (Actually SHARED mappings can technically still OOM, but only if the sysadmin explicitly reduces the hugepage pool between mapping and instantiation.) This patch appears to address the problem at hand - it allows DB2 to start correctly, for instance, which previously suffered the failure described above. This patch causes no regressions on the libhugetlbfs testsuite, and makes a test (designed to catch this problem) pass which previously failed (ppc64, POWER5). Signed-off-by: David Gibson <dwg@au1.ibm.com> Cc: William Lee Irwin III <wli@holomorphy.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/hugetlb.c136
1 files changed, 126 insertions, 10 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d5987a87bbe5..27fad5d9bcf6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
22#include "internal.h" 22#include "internal.h"
23 23
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages; 25static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
26unsigned long max_huge_pages; 26unsigned long max_huge_pages;
27static struct list_head hugepage_freelists[MAX_NUMNODES]; 27static struct list_head hugepage_freelists[MAX_NUMNODES];
28static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 28static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -120,17 +120,136 @@ void free_huge_page(struct page *page)
120 120
121struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) 121struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
122{ 122{
123 struct inode *inode = vma->vm_file->f_dentry->d_inode;
123 struct page *page; 124 struct page *page;
125 int use_reserve = 0;
126 unsigned long idx;
124 127
125 spin_lock(&hugetlb_lock); 128 spin_lock(&hugetlb_lock);
126 page = dequeue_huge_page(vma, addr); 129
127 if (!page) { 130 if (vma->vm_flags & VM_MAYSHARE) {
128 spin_unlock(&hugetlb_lock); 131
129 return NULL; 132 /* idx = radix tree index, i.e. offset into file in
133 * HPAGE_SIZE units */
134 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
135 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
136
137 /* The hugetlbfs specific inode info stores the number
138 * of "guaranteed available" (huge) pages. That is,
139 * the first 'prereserved_hpages' pages of the inode
140 * are either already instantiated, or have been
141 * pre-reserved (by hugetlb_reserve_for_inode()). Here
142 * we're in the process of instantiating the page, so
143 * we use this to determine whether to draw from the
144 * pre-reserved pool or the truly free pool. */
145 if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
146 use_reserve = 1;
147 }
148
149 if (!use_reserve) {
150 if (free_huge_pages <= reserved_huge_pages)
151 goto fail;
152 } else {
153 BUG_ON(reserved_huge_pages == 0);
154 reserved_huge_pages--;
130 } 155 }
156
157 page = dequeue_huge_page(vma, addr);
158 if (!page)
159 goto fail;
160
131 spin_unlock(&hugetlb_lock); 161 spin_unlock(&hugetlb_lock);
132 set_page_refcounted(page); 162 set_page_refcounted(page);
133 return page; 163 return page;
164
165 fail:
166 WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
167 spin_unlock(&hugetlb_lock);
168 return NULL;
169}
170
171/* hugetlb_extend_reservation()
172 *
173 * Ensure that at least 'atleast' hugepages are, and will remain,
174 * available to instantiate the first 'atleast' pages of the given
175 * inode. If the inode doesn't already have this many pages reserved
176 * or instantiated, set aside some hugepages in the reserved pool to
177 * satisfy later faults (or fail now if there aren't enough, rather
178 * than getting the SIGBUS later).
179 */
180int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
181 unsigned long atleast)
182{
183 struct inode *inode = &info->vfs_inode;
184 unsigned long change_in_reserve = 0;
185 int ret = 0;
186
187 spin_lock(&hugetlb_lock);
188 read_lock_irq(&inode->i_mapping->tree_lock);
189
190 if (info->prereserved_hpages >= atleast)
191 goto out;
192
193 /* Because we always call this on shared mappings, none of the
194 * pages beyond info->prereserved_hpages can have been
195 * instantiated, so we need to reserve all of them now. */
196 change_in_reserve = atleast - info->prereserved_hpages;
197
198 if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
199 ret = -ENOMEM;
200 goto out;
201 }
202
203 reserved_huge_pages += change_in_reserve;
204 info->prereserved_hpages = atleast;
205
206 out:
207 read_unlock_irq(&inode->i_mapping->tree_lock);
208 spin_unlock(&hugetlb_lock);
209
210 return ret;
211}
212
213/* hugetlb_truncate_reservation()
214 *
215 * This returns pages reserved for the given inode to the general free
216 * hugepage pool. If the inode has any pages prereserved, but not
217 * instantiated, beyond offset (atmost << HPAGE_SHIFT), then release
218 * them.
219 */
220void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
221 unsigned long atmost)
222{
223 struct inode *inode = &info->vfs_inode;
224 struct address_space *mapping = inode->i_mapping;
225 unsigned long idx;
226 unsigned long change_in_reserve = 0;
227 struct page *page;
228
229 spin_lock(&hugetlb_lock);
230 read_lock_irq(&inode->i_mapping->tree_lock);
231
232 if (info->prereserved_hpages <= atmost)
233 goto out;
234
235 /* Count pages which were reserved, but not instantiated, and
236 * which we can now release. */
237 for (idx = atmost; idx < info->prereserved_hpages; idx++) {
238 page = radix_tree_lookup(&mapping->page_tree, idx);
239 if (!page)
240 /* Pages which are already instantiated can't
241 * be unreserved (and in fact have already
242 * been removed from the reserved pool) */
243 change_in_reserve++;
244 }
245
246 BUG_ON(reserved_huge_pages < change_in_reserve);
247 reserved_huge_pages -= change_in_reserve;
248 info->prereserved_hpages = atmost;
249
250 out:
251 read_unlock_irq(&inode->i_mapping->tree_lock);
252 spin_unlock(&hugetlb_lock);
134} 253}
135 254
136static int __init hugetlb_init(void) 255static int __init hugetlb_init(void)
@@ -238,9 +357,11 @@ int hugetlb_report_meminfo(char *buf)
238 return sprintf(buf, 357 return sprintf(buf,
239 "HugePages_Total: %5lu\n" 358 "HugePages_Total: %5lu\n"
240 "HugePages_Free: %5lu\n" 359 "HugePages_Free: %5lu\n"
360 "HugePages_Rsvd: %5lu\n"
241 "Hugepagesize: %5lu kB\n", 361 "Hugepagesize: %5lu kB\n",
242 nr_huge_pages, 362 nr_huge_pages,
243 free_huge_pages, 363 free_huge_pages,
364 reserved_huge_pages,
244 HPAGE_SIZE/1024); 365 HPAGE_SIZE/1024);
245} 366}
246 367
@@ -253,11 +374,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
253 nid, free_huge_pages_node[nid]); 374 nid, free_huge_pages_node[nid]);
254} 375}
255 376
256int is_hugepage_mem_enough(size_t size)
257{
258 return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
259}
260
261/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ 377/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
262unsigned long hugetlb_total_pages(void) 378unsigned long hugetlb_total_pages(void)
263{ 379{