hugetlb: Try to grow hugetlb pool for MAP_PRIVATE mappings

Because we overcommit hugepages for MAP_PRIVATE mappings, it is possible that the hugetlb pool will be exhausted or completely reserved when a hugepage is needed to satisfy a page fault. Before killing the process in this situation, try to allocate a hugepage directly from the buddy allocator.

The explicitly configured pool size becomes a low watermark. When dynamically grown, the allocated huge pages are accounted as a surplus over the watermark. As huge pages are freed on a node, surplus pages are released to the buddy allocator so that the pool will shrink back to the watermark.

Surplus accounting also allows for friendlier explicit pool resizing. When shrinking a pool that is fully in-use, increase the surplus so pages will be returned to the buddy allocator as soon as they are freed. When growing a pool that has a surplus, consume the surplus first and then allocate new pages.

Signed-off-by: Adam Litke <agl@us.ibm.com>
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Dave McCracken <dave.mccracken@oracle.com>
Cc: William Irwin <bill.irwin@oracle.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Ken Chen <kenchen@google.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Adam Litke <agl@us.ibm.com> 2007-10-16 04:26:18 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-10-16 12:43:02 -0400
commit: 7893d1d505d59db9d4f35165c8b6d3c6dff40a32 (patch)
tree: 6bea3b41e111b1d1774980296a032012a3926e9c /mm/hugetlb.c
parent: 6af2acb6619688046039234f716fd003e6ed2b3f (diff)
1 files changed, 125 insertions, 14 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ba029d64074..8768e525032 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -23,10 +23,12 @@
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
+static unsigned long surplus_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
+static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
@@ -109,15 +111,57 @@ static void update_and_free_page(struct page *page)
 static void free_huge_page(struct page *page)
 {
-        BUG_ON(page_count(page));
+        int nid = page_to_nid(page);
+        BUG_ON(page_count(page));
        INIT_LIST_HEAD(&page->lru);
        spin_lock(&hugetlb_lock);
-        enqueue_huge_page(page);
+        if (surplus_huge_pages_node[nid]) {
+                update_and_free_page(page);
+                surplus_huge_pages--;
+                surplus_huge_pages_node[nid]--;
+        } else {
+                enqueue_huge_page(page);
+        }
        spin_unlock(&hugetlb_lock);
 }
+/*
+ * Increment or decrement surplus_huge_pages.  Keep node-specific counters
+ * balanced by operating on them in a round-robin fashion.
+ * Returns 1 if an adjustment was made.
+ */
+static int adjust_pool_surplus(int delta)
+{
+        static int prev_nid;
+        int nid = prev_nid;
+        int ret = 0;
+        VM_BUG_ON(delta != -1 && delta != 1);
+        do {
+                nid = next_node(nid, node_online_map);
+                if (nid == MAX_NUMNODES)
+                        nid = first_node(node_online_map);
+                /* To shrink on this node, there must be a surplus page */
+                if (delta < 0 && !surplus_huge_pages_node[nid])
+                        continue;
+                /* Surplus cannot exceed the total number of pages */
+                if (delta > 0 && surplus_huge_pages_node[nid] >=
+                                                nr_huge_pages_node[nid])
+                        continue;
+                surplus_huge_pages += delta;
+                surplus_huge_pages_node[nid] += delta;
+                ret = 1;
+                break;
+        } while (nid != prev_nid);
+        prev_nid = nid;
+        return ret;
+}
 static int alloc_fresh_huge_page(void)
 {
        static int prev_nid;
@@ -150,10 +194,30 @@ static int alloc_fresh_huge_page(void)
        return 0;
 }
+static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
+                                                unsigned long address)
+{
+        struct page *page;
+        page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
+                                        HUGETLB_PAGE_ORDER);
+        if (page) {
+                set_compound_page_dtor(page, free_huge_page);
+                spin_lock(&hugetlb_lock);
+                nr_huge_pages++;
+                nr_huge_pages_node[page_to_nid(page)]++;
+                surplus_huge_pages++;
+                surplus_huge_pages_node[page_to_nid(page)]++;
+                spin_unlock(&hugetlb_lock);
+        }
+        return page;
+}
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
                                    unsigned long addr)
 {
-        struct page *page;
+        struct page *page = NULL;
        spin_lock(&hugetlb_lock);
        if (vma->vm_flags & VM_MAYSHARE)
@@ -173,7 +237,16 @@ fail:
        if (vma->vm_flags & VM_MAYSHARE)
                resv_huge_pages++;
        spin_unlock(&hugetlb_lock);
-        return NULL;
+        /*
+         * Private mappings do not use reserved huge pages so the allocation
+         * may have failed due to an undersized hugetlb pool.  Try to grab a
+         * surplus huge page from the buddy allocator.
+         */
+        if (!(vma->vm_flags & VM_MAYSHARE))
+                page = alloc_buddy_huge_page(vma, addr);
+        return page;
 }
 static int __init hugetlb_init(void)
@@ -241,26 +314,62 @@ static inline void try_to_free_low(unsigned long count)
 }
 #endif
+#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
 static unsigned long set_max_huge_pages(unsigned long count)
 {
-        while (count > nr_huge_pages) {
+        unsigned long min_count, ret;
-                if (!alloc_fresh_huge_page())
-                        return nr_huge_pages;
-        }
-        if (count >= nr_huge_pages)
-                return nr_huge_pages;
+        /*
+         * Increase the pool size
+         * First take pages out of surplus state.  Then make up the
+         * remaining difference by allocating fresh huge pages.
+         */
        spin_lock(&hugetlb_lock);
-        count = max(count, resv_huge_pages);
+        while (surplus_huge_pages && count > persistent_huge_pages) {
-        try_to_free_low(count);
+                if (!adjust_pool_surplus(-1))
-        while (count < nr_huge_pages) {
+                        break;
+        }
+        while (count > persistent_huge_pages) {
+                int ret;
+                /*
+                 * If this allocation races such that we no longer need the
+                 * page, free_huge_page will handle it by freeing the page
+                 * and reducing the surplus.
+                 */
+                spin_unlock(&hugetlb_lock);
+                ret = alloc_fresh_huge_page();
+                spin_lock(&hugetlb_lock);
+                if (!ret)
+                        goto out;
+        }
+        if (count >= persistent_huge_pages)
+                goto out;
+        /*
+         * Decrease the pool size
+         * First return free pages to the buddy allocator (being careful
+         * to keep enough around to satisfy reservations).  Then place
+         * pages into surplus state as needed so the pool will shrink
+         * to the desired size as pages become free.
+         */
+        min_count = max(count, resv_huge_pages);
+        try_to_free_low(min_count);
+        while (min_count < persistent_huge_pages) {
                struct page *page = dequeue_huge_page(NULL, 0);
                if (!page)
                        break;
                update_and_free_page(page);
        }
+        while (count < persistent_huge_pages) {
+                if (!adjust_pool_surplus(1))
+                        break;
+        }
+out:
+        ret = persistent_huge_pages;
        spin_unlock(&hugetlb_lock);
-        return nr_huge_pages;
+        return ret;
 }
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -292,10 +401,12 @@ int hugetlb_report_meminfo(char *buf)
                        "HugePages_Total: %5lu\n"
                        "HugePages_Free:  %5lu\n"
                        "HugePages_Rsvd:  %5lu\n"
+                        "HugePages_Surp:  %5lu\n"
                        "Hugepagesize:    %5lu kB\n",
                        nr_huge_pages,
                        free_huge_pages,
                        resv_huge_pages,
+                        surplus_huge_pages,
                        HPAGE_SIZE/1024);
 }
author	Adam Litke <agl@us.ibm.com>	2007-10-16 04:26:18 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-16 12:43:02 -0400
commit	7893d1d505d59db9d4f35165c8b6d3c6dff40a32 (patch)
tree	6bea3b41e111b1d1774980296a032012a3926e9c /mm/hugetlb.c
parent	6af2acb6619688046039234f716fd003e6ed2b3f (diff)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ba029d64074..8768e525032 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -23,10 +23,12 @@
23		23
24	const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;	24	const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25	static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;	25	static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
		26	static unsigned long surplus_huge_pages;
26	unsigned long max_huge_pages;	27	unsigned long max_huge_pages;
27	static struct list_head hugepage_freelists[MAX_NUMNODES];	28	static struct list_head hugepage_freelists[MAX_NUMNODES];
28	static unsigned int nr_huge_pages_node[MAX_NUMNODES];	29	static unsigned int nr_huge_pages_node[MAX_NUMNODES];
29	static unsigned int free_huge_pages_node[MAX_NUMNODES];	30	static unsigned int free_huge_pages_node[MAX_NUMNODES];
		31	static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
30	static gfp_t htlb_alloc_mask = GFP_HIGHUSER;	32	static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
31	unsigned long hugepages_treat_as_movable;	33	unsigned long hugepages_treat_as_movable;
32		34
@@ -109,15 +111,57 @@ static void update_and_free_page(struct page *page)
109		111
110	static void free_huge_page(struct page *page)	112	static void free_huge_page(struct page *page)
111	{	113	{
112	BUG_ON(page_count(page));	114	int nid = page_to_nid(page);
113		115
		116	BUG_ON(page_count(page));
114	INIT_LIST_HEAD(&page->lru);	117	INIT_LIST_HEAD(&page->lru);
115		118
116	spin_lock(&hugetlb_lock);	119	spin_lock(&hugetlb_lock);
117	enqueue_huge_page(page);	120	if (surplus_huge_pages_node[nid]) {
		121	update_and_free_page(page);
		122	surplus_huge_pages--;
		123	surplus_huge_pages_node[nid]--;
		124	} else {
		125	enqueue_huge_page(page);
		126	}
118	spin_unlock(&hugetlb_lock);	127	spin_unlock(&hugetlb_lock);
119	}	128	}
120		129
		130	/*
		131	* Increment or decrement surplus_huge_pages. Keep node-specific counters
		132	* balanced by operating on them in a round-robin fashion.
		133	* Returns 1 if an adjustment was made.
		134	*/
		135	static int adjust_pool_surplus(int delta)
		136	{
		137	static int prev_nid;
		138	int nid = prev_nid;
		139	int ret = 0;
		140
		141	VM_BUG_ON(delta != -1 && delta != 1);
		142	do {
		143	nid = next_node(nid, node_online_map);
		144	if (nid == MAX_NUMNODES)
		145	nid = first_node(node_online_map);
		146
		147	/* To shrink on this node, there must be a surplus page */
		148	if (delta < 0 && !surplus_huge_pages_node[nid])
		149	continue;
		150	/* Surplus cannot exceed the total number of pages */
		151	if (delta > 0 && surplus_huge_pages_node[nid] >=
		152	nr_huge_pages_node[nid])
		153	continue;
		154
		155	surplus_huge_pages += delta;
		156	surplus_huge_pages_node[nid] += delta;
		157	ret = 1;
		158	break;
		159	} while (nid != prev_nid);
		160
		161	prev_nid = nid;
		162	return ret;
		163	}
		164
121	static int alloc_fresh_huge_page(void)	165	static int alloc_fresh_huge_page(void)
122	{	166	{
123	static int prev_nid;	167	static int prev_nid;
@@ -150,10 +194,30 @@ static int alloc_fresh_huge_page(void)
150	return 0;	194	return 0;
151	}	195	}
152		196
		197	static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
		198	unsigned long address)
		199	{
		200	struct page *page;
		201
		202	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
		203	HUGETLB_PAGE_ORDER);
		204	if (page) {
		205	set_compound_page_dtor(page, free_huge_page);
		206	spin_lock(&hugetlb_lock);
		207	nr_huge_pages++;
		208	nr_huge_pages_node[page_to_nid(page)]++;
		209	surplus_huge_pages++;
		210	surplus_huge_pages_node[page_to_nid(page)]++;
		211	spin_unlock(&hugetlb_lock);
		212	}
		213
		214	return page;
		215	}
		216
153	static struct page *alloc_huge_page(struct vm_area_struct *vma,	217	static struct page *alloc_huge_page(struct vm_area_struct *vma,
154	unsigned long addr)	218	unsigned long addr)
155	{	219	{
156	struct page *page;	220	struct page *page = NULL;
157		221
158	spin_lock(&hugetlb_lock);	222	spin_lock(&hugetlb_lock);
159	if (vma->vm_flags & VM_MAYSHARE)	223	if (vma->vm_flags & VM_MAYSHARE)
@@ -173,7 +237,16 @@ fail:
173	if (vma->vm_flags & VM_MAYSHARE)	237	if (vma->vm_flags & VM_MAYSHARE)
174	resv_huge_pages++;	238	resv_huge_pages++;
175	spin_unlock(&hugetlb_lock);	239	spin_unlock(&hugetlb_lock);
176	return NULL;	240
		241	/*
		242	* Private mappings do not use reserved huge pages so the allocation
		243	* may have failed due to an undersized hugetlb pool. Try to grab a
		244	* surplus huge page from the buddy allocator.
		245	*/
		246	if (!(vma->vm_flags & VM_MAYSHARE))
		247	page = alloc_buddy_huge_page(vma, addr);
		248
		249	return page;
177	}	250	}
178		251
179	static int __init hugetlb_init(void)	252	static int __init hugetlb_init(void)
@@ -241,26 +314,62 @@ static inline void try_to_free_low(unsigned long count)
241	}	314	}
242	#endif	315	#endif
243		316
		317	#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
244	static unsigned long set_max_huge_pages(unsigned long count)	318	static unsigned long set_max_huge_pages(unsigned long count)
245	{	319	{
246	while (count > nr_huge_pages) {	320	unsigned long min_count, ret;
247	if (!alloc_fresh_huge_page())
248	return nr_huge_pages;
249	}
250	if (count >= nr_huge_pages)
251	return nr_huge_pages;
252		321
		322	/*
		323	* Increase the pool size
		324	* First take pages out of surplus state. Then make up the
		325	* remaining difference by allocating fresh huge pages.
		326	*/
253	spin_lock(&hugetlb_lock);	327	spin_lock(&hugetlb_lock);
254	count = max(count, resv_huge_pages);	328	while (surplus_huge_pages && count > persistent_huge_pages) {
255	try_to_free_low(count);	329	if (!adjust_pool_surplus(-1))
256	while (count < nr_huge_pages) {	330	break;
		331	}
		332
		333	while (count > persistent_huge_pages) {
		334	int ret;
		335	/*
		336	* If this allocation races such that we no longer need the
		337	* page, free_huge_page will handle it by freeing the page
		338	* and reducing the surplus.
		339	*/
		340	spin_unlock(&hugetlb_lock);
		341	ret = alloc_fresh_huge_page();
		342	spin_lock(&hugetlb_lock);
		343	if (!ret)
		344	goto out;
		345
		346	}
		347	if (count >= persistent_huge_pages)
		348	goto out;
		349
		350	/*
		351	* Decrease the pool size
		352	* First return free pages to the buddy allocator (being careful
		353	* to keep enough around to satisfy reservations). Then place
		354	* pages into surplus state as needed so the pool will shrink
		355	* to the desired size as pages become free.
		356	*/
		357	min_count = max(count, resv_huge_pages);
		358	try_to_free_low(min_count);
		359	while (min_count < persistent_huge_pages) {
257	struct page *page = dequeue_huge_page(NULL, 0);	360	struct page *page = dequeue_huge_page(NULL, 0);
258	if (!page)	361	if (!page)
259	break;	362	break;
260	update_and_free_page(page);	363	update_and_free_page(page);
261	}	364	}
		365	while (count < persistent_huge_pages) {
		366	if (!adjust_pool_surplus(1))
		367	break;
		368	}
		369	out:
		370	ret = persistent_huge_pages;
262	spin_unlock(&hugetlb_lock);	371	spin_unlock(&hugetlb_lock);
263	return nr_huge_pages;	372	return ret;
264	}	373	}
265		374
266	int hugetlb_sysctl_handler(struct ctl_table *table, int write,	375	int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -292,10 +401,12 @@ int hugetlb_report_meminfo(char *buf)
292	"HugePages_Total: %5lu\n"	401	"HugePages_Total: %5lu\n"
293	"HugePages_Free:  %5lu\n"	402	"HugePages_Free:  %5lu\n"
294	"HugePages_Rsvd:  %5lu\n"	403	"HugePages_Rsvd:  %5lu\n"
		404	"HugePages_Surp:  %5lu\n"
295	"Hugepagesize:    %5lu kB\n",	405	"Hugepagesize:    %5lu kB\n",
296	nr_huge_pages,	406	nr_huge_pages,
297	free_huge_pages,	407	free_huge_pages,
298	resv_huge_pages,	408	resv_huge_pages,
		409	surplus_huge_pages,
299	HPAGE_SIZE/1024);	410	HPAGE_SIZE/1024);
300	}	411	}
301		412