[PATCH] hugepage: Strict page reservation for hugepage inodes

These days, hugepages are demand-allocated at first fault time. There's a somewhat dubious (and racy) heuristic when making a new mmap() to check if there are enough available hugepages to fully satisfy that mapping. A particularly obvious case where the heuristic breaks down is where a process maps its hugepages not as a single chunk, but as a bunch of individually mmap()ed (or shmat()ed) blocks without touching and instantiating the pages in between allocations. In this case the size of each block is compared against the total number of available hugepages. It's thus easy for the process to become overcommitted, because each block mapping will succeed, although the total number of hugepages required by all blocks exceeds the number available. In particular, this defeats such a program which will detect a mapping failure and adjust its hugepage usage downward accordingly. The patch below addresses this problem, by strictly reserving a number of physical hugepages for hugepage inodes which have been mapped, but not instantiated. MAP_SHARED mappings are thus "safe" - they will fail on mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still trigger an OOM. (Actually SHARED mappings can technically still OOM, but only if the sysadmin explicitly reduces the hugepage pool between mapping and instantiation.) This patch appears to address the problem at hand - it allows DB2 to start correctly, for instance, which previously suffered the failure described above. This patch causes no regressions on the libhugetlbfs testsuite, and makes a test (designed to catch this problem) pass which previously failed (ppc64, POWER5). Signed-off-by: David Gibson <dwg@au1.ibm.com> Cc: William Lee Irwin III <wli@holomorphy.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: David Gibson <david@gibson.dropbear.id.au> 2006-03-22 03:08:55 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-03-22 10:54:03 -0500
commit: b45b5bd65f668a665db40d093e4e1fe563533608 (patch)
tree: aa3806bd87fd7aa719b561e4d468c779f6adb31b /mm
parent: 3935baa9bcda3ccaee4f7849f5157d316e34412e (diff)
1 files changed, 126 insertions, 10 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d5987a87bbe5..27fad5d9bcf6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
 #include "internal.h"
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -120,17 +120,136 @@ void free_huge_page(struct page *page)
 struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
+        struct inode *inode = vma->vm_file->f_dentry->d_inode;
        struct page *page;
+        int use_reserve = 0;
+        unsigned long idx;
        spin_lock(&hugetlb_lock);
-        page = dequeue_huge_page(vma, addr);
-        if (!page) {
+        if (vma->vm_flags & VM_MAYSHARE) {
-                spin_unlock(&hugetlb_lock);
-                return NULL;
+                /* idx = radix tree index, i.e. offset into file in
+                 * HPAGE_SIZE units */
+                idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+                        + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+                /* The hugetlbfs specific inode info stores the number
+                 * of "guaranteed available" (huge) pages.  That is,
+                 * the first 'prereserved_hpages' pages of the inode
+                 * are either already instantiated, or have been
+                 * pre-reserved (by hugetlb_extend_reservation()). Here
+                 * we're in the process of instantiating the page, so
+                 * we use this to determine whether to draw from the
+                 * pre-reserved pool or the truly free pool. */
+                if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
+                        use_reserve = 1;
+        }
+        if (!use_reserve) {
+                if (free_huge_pages <= reserved_huge_pages)
+                        goto fail;
+        } else {
+                BUG_ON(reserved_huge_pages == 0);
+                reserved_huge_pages--;
        }
+        page = dequeue_huge_page(vma, addr);
+        if (!page)
+                goto fail;
        spin_unlock(&hugetlb_lock);
        set_page_refcounted(page);
        return page;
+ fail:
+        WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
+        spin_unlock(&hugetlb_lock);
+        return NULL;
+}
+/* hugetlb_extend_reservation()
+ *
+ * Ensure that at least 'atleast' hugepages are, and will remain,
+ * available to instantiate the first 'atleast' pages of the given
+ * inode.  If the inode doesn't already have this many pages reserved
+ * or instantiated, set aside some hugepages in the reserved pool to
+ * satisfy later faults (or fail now if there aren't enough, rather
+ * than getting the SIGBUS later).
+ */
+int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
+                               unsigned long atleast)
+{
+        struct inode *inode = &info->vfs_inode;
+        unsigned long change_in_reserve = 0;
+        int ret = 0;
+        spin_lock(&hugetlb_lock);
+        read_lock_irq(&inode->i_mapping->tree_lock);
+        if (info->prereserved_hpages >= atleast)
+                goto out;
+        /* Because we always call this on shared mappings, none of the
+         * pages beyond info->prereserved_hpages can have been
+         * instantiated, so we need to reserve all of them now. */
+        change_in_reserve = atleast - info->prereserved_hpages;
+        if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        reserved_huge_pages += change_in_reserve;
+        info->prereserved_hpages = atleast;
+ out:
+        read_unlock_irq(&inode->i_mapping->tree_lock);
+        spin_unlock(&hugetlb_lock);
+        return ret;
+}
+/* hugetlb_truncate_reservation()
+ *
+ * This returns pages reserved for the given inode to the general free
+ * hugepage pool.  If the inode has any pages prereserved, but not
+ * instantiated, beyond offset (atmost << HPAGE_SHIFT), then release
+ * them.
+ */
+void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
+                                  unsigned long atmost)
+{
+        struct inode *inode = &info->vfs_inode;
+        struct address_space *mapping = inode->i_mapping;
+        unsigned long idx;
+        unsigned long change_in_reserve = 0;
+        struct page *page;
+        spin_lock(&hugetlb_lock);
+        read_lock_irq(&inode->i_mapping->tree_lock);
+        if (info->prereserved_hpages <= atmost)
+                goto out;
+        /* Count pages which were reserved, but not instantiated, and
+         * which we can now release. */
+        for (idx = atmost; idx < info->prereserved_hpages; idx++) {
+                page = radix_tree_lookup(&mapping->page_tree, idx);
+                if (!page)
+                        /* Pages which are already instantiated can't
+                         * be unreserved (and in fact have already
+                         * been removed from the reserved pool) */
+                        change_in_reserve++;
+        }
+        BUG_ON(reserved_huge_pages < change_in_reserve);
+        reserved_huge_pages -= change_in_reserve;
+        info->prereserved_hpages = atmost;
+ out:
+        read_unlock_irq(&inode->i_mapping->tree_lock);
+        spin_unlock(&hugetlb_lock);
 }
 static int __init hugetlb_init(void)
@@ -238,9 +357,11 @@ int hugetlb_report_meminfo(char *buf)
        return sprintf(buf,
                        "HugePages_Total: %5lu\n"
                        "HugePages_Free:  %5lu\n"
+                        "HugePages_Rsvd:  %5lu\n"
                        "Hugepagesize:    %5lu kB\n",
                        nr_huge_pages,
                        free_huge_pages,
+                        reserved_huge_pages,
                        HPAGE_SIZE/1024);
 }
@@ -253,11 +374,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
                nid, free_huge_pages_node[nid]);
 }
-int is_hugepage_mem_enough(size_t size)
-{
-        return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
-}
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
author	David Gibson <david@gibson.dropbear.id.au>	2006-03-22 03:08:55 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-03-22 10:54:03 -0500
commit	b45b5bd65f668a665db40d093e4e1fe563533608 (patch)
tree	aa3806bd87fd7aa719b561e4d468c779f6adb31b /mm
parent	3935baa9bcda3ccaee4f7849f5157d316e34412e (diff)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d5987a87bbe5..27fad5d9bcf6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
22	#include "internal.h"	22	#include "internal.h"
23		23
24	const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;	24	const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25	static unsigned long nr_huge_pages, free_huge_pages;	25	static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
26	unsigned long max_huge_pages;	26	unsigned long max_huge_pages;
27	static struct list_head hugepage_freelists[MAX_NUMNODES];	27	static struct list_head hugepage_freelists[MAX_NUMNODES];
28	static unsigned int nr_huge_pages_node[MAX_NUMNODES];	28	static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -120,17 +120,136 @@ void free_huge_page(struct page *page)
120		120
121	struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)	121	struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
122	{	122	{
		123	struct inode *inode = vma->vm_file->f_dentry->d_inode;
123	struct page *page;	124	struct page *page;
		125	int use_reserve = 0;
		126	unsigned long idx;
124		127
125	spin_lock(&hugetlb_lock);	128	spin_lock(&hugetlb_lock);
126	page = dequeue_huge_page(vma, addr);	129
127	if (!page) {	130	if (vma->vm_flags & VM_MAYSHARE) {
128	spin_unlock(&hugetlb_lock);	131
129	return NULL;	132	/* idx = radix tree index, i.e. offset into file in
		133	* HPAGE_SIZE units */
		134	idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
		135	+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
		136
		137	/* The hugetlbfs specific inode info stores the number
		138	* of "guaranteed available" (huge) pages. That is,
		139	* the first 'prereserved_hpages' pages of the inode
		140	* are either already instantiated, or have been
		141	* pre-reserved (by hugetlb_extend_reservation()). Here
		142	* we're in the process of instantiating the page, so
		143	* we use this to determine whether to draw from the
		144	* pre-reserved pool or the truly free pool. */
		145	if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
		146	use_reserve = 1;
		147	}
		148
		149	if (!use_reserve) {
		150	if (free_huge_pages <= reserved_huge_pages)
		151	goto fail;
		152	} else {
		153	BUG_ON(reserved_huge_pages == 0);
		154	reserved_huge_pages--;
130	}	155	}
		156
		157	page = dequeue_huge_page(vma, addr);
		158	if (!page)
		159	goto fail;
		160
131	spin_unlock(&hugetlb_lock);	161	spin_unlock(&hugetlb_lock);
132	set_page_refcounted(page);	162	set_page_refcounted(page);
133	return page;	163	return page;
		164
		165	fail:
		166	WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
		167	spin_unlock(&hugetlb_lock);
		168	return NULL;
		169	}
		170
		171	/* hugetlb_extend_reservation()
		172	*
		173	* Ensure that at least 'atleast' hugepages are, and will remain,
		174	* available to instantiate the first 'atleast' pages of the given
		175	* inode. If the inode doesn't already have this many pages reserved
		176	* or instantiated, set aside some hugepages in the reserved pool to
		177	* satisfy later faults (or fail now if there aren't enough, rather
		178	* than getting the SIGBUS later).
		179	*/
		180	int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
		181	unsigned long atleast)
		182	{
		183	struct inode *inode = &info->vfs_inode;
		184	unsigned long change_in_reserve = 0;
		185	int ret = 0;
		186
		187	spin_lock(&hugetlb_lock);
		188	read_lock_irq(&inode->i_mapping->tree_lock);
		189
		190	if (info->prereserved_hpages >= atleast)
		191	goto out;
		192
		193	/* Because we always call this on shared mappings, none of the
		194	* pages beyond info->prereserved_hpages can have been
		195	* instantiated, so we need to reserve all of them now. */
		196	change_in_reserve = atleast - info->prereserved_hpages;
		197
		198	if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
		199	ret = -ENOMEM;
		200	goto out;
		201	}
		202
		203	reserved_huge_pages += change_in_reserve;
		204	info->prereserved_hpages = atleast;
		205
		206	out:
		207	read_unlock_irq(&inode->i_mapping->tree_lock);
		208	spin_unlock(&hugetlb_lock);
		209
		210	return ret;
		211	}
		212
		213	/* hugetlb_truncate_reservation()
		214	*
		215	* This returns pages reserved for the given inode to the general free
		216	* hugepage pool. If the inode has any pages prereserved, but not
		217	* instantiated, beyond offset (atmost << HPAGE_SHIFT), then release
		218	* them.
		219	*/
		220	void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
		221	unsigned long atmost)
		222	{
		223	struct inode *inode = &info->vfs_inode;
		224	struct address_space *mapping = inode->i_mapping;
		225	unsigned long idx;
		226	unsigned long change_in_reserve = 0;
		227	struct page *page;
		228
		229	spin_lock(&hugetlb_lock);
		230	read_lock_irq(&inode->i_mapping->tree_lock);
		231
		232	if (info->prereserved_hpages <= atmost)
		233	goto out;
		234
		235	/* Count pages which were reserved, but not instantiated, and
		236	* which we can now release. */
		237	for (idx = atmost; idx < info->prereserved_hpages; idx++) {
		238	page = radix_tree_lookup(&mapping->page_tree, idx);
		239	if (!page)
		240	/* Pages which are already instantiated can't
		241	* be unreserved (and in fact have already
		242	* been removed from the reserved pool) */
		243	change_in_reserve++;
		244	}
		245
		246	BUG_ON(reserved_huge_pages < change_in_reserve);
		247	reserved_huge_pages -= change_in_reserve;
		248	info->prereserved_hpages = atmost;
		249
		250	out:
		251	read_unlock_irq(&inode->i_mapping->tree_lock);
		252	spin_unlock(&hugetlb_lock);
134	}	253	}
135		254
136	static int __init hugetlb_init(void)	255	static int __init hugetlb_init(void)
@@ -238,9 +357,11 @@ int hugetlb_report_meminfo(char *buf)
238	return sprintf(buf,	357	return sprintf(buf,
239	"HugePages_Total: %5lu\n"	358	"HugePages_Total: %5lu\n"
240	"HugePages_Free: %5lu\n"	359	"HugePages_Free: %5lu\n"
		360	"HugePages_Rsvd: %5lu\n"
241	"Hugepagesize: %5lu kB\n",	361	"Hugepagesize: %5lu kB\n",
242	nr_huge_pages,	362	nr_huge_pages,
243	free_huge_pages,	363	free_huge_pages,
		364	reserved_huge_pages,
244	HPAGE_SIZE/1024);	365	HPAGE_SIZE/1024);
245	}	366	}
246		367
@@ -253,11 +374,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
253	nid, free_huge_pages_node[nid]);	374	nid, free_huge_pages_node[nid]);
254	}	375	}
255		376
256	int is_hugepage_mem_enough(size_t size)
257	{
258	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
259	}
260
261	/* Return the number pages of memory we physically have, in PAGE_SIZE units. */	377	/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
262	unsigned long hugetlb_total_pages(void)	378	unsigned long hugetlb_total_pages(void)
263	{	379	{