author     Adam Litke <agl@us.ibm.com>                            2007-10-16 04:26:19 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>   2007-10-16 12:43:02 -0400
commit     e4e574b767ba63101cfda2b42d72f38546319297 (patch)
tree       084b94d01c71ccd898f8df0ec441e6726e657e75 /mm/hugetlb.c
parent     7893d1d505d59db9d4f35165c8b6d3c6dff40a32 (diff)
hugetlb: Try to grow hugetlb pool for MAP_SHARED mappings
Shared mappings require special handling because the huge pages needed to
fully populate the VMA must be reserved at mmap time. If not enough pages are
available when making the reservation, allocate all of the shortfall at once
from the buddy allocator and add the pages directly to the hugetlb pool. If
they cannot be allocated, then fail the mapping. The page surplus is
accounted for in the same way as for private mappings; faulted surplus pages
will be freed at unmap time. Reserved surplus pages that have not been used
must be freed separately when their reservation is released.
Signed-off-by: Adam Litke <agl@us.ibm.com>
Acked-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Dave McCracken <dave.mccracken@oracle.com>
Cc: William Irwin <bill.irwin@oracle.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Ken Chen <kenchen@google.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
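
The mechanism described in the changelog can be modelled with a handful of counters. The snippet below is a minimal userspace sketch of that accounting only, not part of the patch: the demo_* names, the fixed pool sizes, and the single buddy_available counter are invented for illustration and stand in for gather_surplus_pages(), return_unused_surplus_pages(), and hugetlb_acct_memory() in the diff that follows.

/*
 * Minimal userspace model of the reservation accounting described in the
 * changelog -- not kernel code.  Locking and the re-check/retry dance of
 * gather_surplus_pages() are deliberately omitted.
 */
#include <stdio.h>

static long free_pages = 4;        /* pages currently in the hugetlb pool  */
static long resv_pages = 0;        /* pages promised to shared mappings    */
static long surplus_pages = 0;     /* pool pages borrowed from the buddy   */
static long buddy_available = 3;   /* pretend buddy-allocator capacity     */

/* Grow the pool so it can cover a reservation of 'delta' pages. */
static int demo_gather_surplus(long delta)
{
        long needed = (resv_pages + delta) - free_pages;

        if (needed <= 0)
                return 0;                 /* pool is already big enough */
        if (needed > buddy_available)
                return -1;                /* shortfall cannot be covered */

        buddy_available -= needed;        /* allocate the whole shortfall */
        free_pages += needed;
        surplus_pages += needed;
        return 0;
}

/* Reserve (delta > 0) or release (delta < 0) pages for a shared mapping. */
static int demo_acct_memory(long delta)
{
        if (delta > 0 && demo_gather_surplus(delta) < 0)
                return -1;                /* fail the mapping at mmap time */

        resv_pages += delta;
        if (delta < 0) {
                /* Unused surplus pages go straight back to the buddy side. */
                long unused = -delta < surplus_pages ? -delta : surplus_pages;

                surplus_pages -= unused;
                free_pages -= unused;
                buddy_available += unused;
        }
        return 0;
}

int main(void)
{
        /* A 6-page shared mapping: 2 surplus pages are pulled into the pool. */
        printf("reserve 6 pages: %s\n", demo_acct_memory(6) ? "fail" : "ok");
        printf("free=%ld resv=%ld surplus=%ld\n",
               free_pages, resv_pages, surplus_pages);

        /* Dropping the reservation returns the unused surplus pages. */
        demo_acct_memory(-6);
        printf("free=%ld resv=%ld surplus=%ld\n",
               free_pages, resv_pages, surplus_pages);
        return 0;
}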
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--   mm/hugetlb.c | 155
1 file changed, 132 insertions(+), 23 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8768e525032..31bbca6b2c9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -87,6 +87,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
                         list_del(&page->lru);
                         free_huge_pages--;
                         free_huge_pages_node[nid]--;
+                        if (vma && vma->vm_flags & VM_MAYSHARE)
+                                resv_huge_pages--;
                         break;
                 }
         }
@@ -214,15 +216,116 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
         return page;
 }
 
+/*
+ * Increase the hugetlb pool such that it can accomodate a reservation
+ * of size 'delta'.
+ */
+static int gather_surplus_pages(int delta)
+{
+        struct list_head surplus_list;
+        struct page *page, *tmp;
+        int ret, i;
+        int needed, allocated;
+
+        needed = (resv_huge_pages + delta) - free_huge_pages;
+        if (needed <= 0)
+                return 0;
+
+        allocated = 0;
+        INIT_LIST_HEAD(&surplus_list);
+
+        ret = -ENOMEM;
+retry:
+        spin_unlock(&hugetlb_lock);
+        for (i = 0; i < needed; i++) {
+                page = alloc_buddy_huge_page(NULL, 0);
+                if (!page) {
+                        /*
+                         * We were not able to allocate enough pages to
+                         * satisfy the entire reservation so we free what
+                         * we've allocated so far.
+                         */
+                        spin_lock(&hugetlb_lock);
+                        needed = 0;
+                        goto free;
+                }
+
+                list_add(&page->lru, &surplus_list);
+        }
+        allocated += needed;
+
+        /*
+         * After retaking hugetlb_lock, we need to recalculate 'needed'
+         * because either resv_huge_pages or free_huge_pages may have changed.
+         */
+        spin_lock(&hugetlb_lock);
+        needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
+        if (needed > 0)
+                goto retry;
+
+        /*
+         * The surplus_list now contains _at_least_ the number of extra pages
+         * needed to accomodate the reservation.  Add the appropriate number
+         * of pages to the hugetlb pool and free the extras back to the buddy
+         * allocator.
+         */
+        needed += allocated;
+        ret = 0;
+free:
+        list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+                list_del(&page->lru);
+                if ((--needed) >= 0)
+                        enqueue_huge_page(page);
+                else
+                        update_and_free_page(page);
+        }
+
+        return ret;
+}
+
+/*
+ * When releasing a hugetlb pool reservation, any surplus pages that were
+ * allocated to satisfy the reservation must be explicitly freed if they were
+ * never used.
+ */
+void return_unused_surplus_pages(unsigned long unused_resv_pages)
+{
+        static int nid = -1;
+        struct page *page;
+        unsigned long nr_pages;
+
+        nr_pages = min(unused_resv_pages, surplus_huge_pages);
+
+        while (nr_pages) {
+                nid = next_node(nid, node_online_map);
+                if (nid == MAX_NUMNODES)
+                        nid = first_node(node_online_map);
+
+                if (!surplus_huge_pages_node[nid])
+                        continue;
+
+                if (!list_empty(&hugepage_freelists[nid])) {
+                        page = list_entry(hugepage_freelists[nid].next,
+                                          struct page, lru);
+                        list_del(&page->lru);
+                        update_and_free_page(page);
+                        free_huge_pages--;
+                        free_huge_pages_node[nid]--;
+                        surplus_huge_pages--;
+                        surplus_huge_pages_node[nid]--;
+                        nr_pages--;
+                }
+        }
+}
+
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
                                 unsigned long addr)
 {
         struct page *page = NULL;
+        int use_reserved_page = vma->vm_flags & VM_MAYSHARE;
 
         spin_lock(&hugetlb_lock);
-        if (vma->vm_flags & VM_MAYSHARE)
-                resv_huge_pages--;
-        else if (free_huge_pages <= resv_huge_pages)
+        if (!use_reserved_page && (free_huge_pages <= resv_huge_pages))
                 goto fail;
 
         page = dequeue_huge_page(vma, addr);
@@ -234,8 +337,6 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
         return page;
 
 fail:
-        if (vma->vm_flags & VM_MAYSHARE)
-                resv_huge_pages++;
         spin_unlock(&hugetlb_lock);
 
         /*
@@ -243,7 +344,7 @@ fail:
          * may have failed due to an undersized hugetlb pool. Try to grab a
          * surplus huge page from the buddy allocator.
          */
-        if (!(vma->vm_flags & VM_MAYSHARE))
+        if (!use_reserved_page)
                 page = alloc_buddy_huge_page(vma, addr);
 
         return page;
@@ -952,21 +1053,6 @@ static int hugetlb_acct_memory(long delta)
         int ret = -ENOMEM;
 
         spin_lock(&hugetlb_lock);
-        if ((delta + resv_huge_pages) <= free_huge_pages) {
-                resv_huge_pages += delta;
-                ret = 0;
-        }
-        spin_unlock(&hugetlb_lock);
-        return ret;
-}
-
-int hugetlb_reserve_pages(struct inode *inode, long from, long to)
-{
-        long ret, chg;
-
-        chg = region_chg(&inode->i_mapping->private_list, from, to);
-        if (chg < 0)
-                return chg;
         /*
          * When cpuset is configured, it breaks the strict hugetlb page
          * reservation as the accounting is done on a global variable. Such
@@ -984,8 +1070,31 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
          * a best attempt and hopefully to minimize the impact of changing
          * semantics that cpuset has.
          */
-        if (chg > cpuset_mems_nr(free_huge_pages_node))
-                return -ENOMEM;
+        if (delta > 0) {
+                if (gather_surplus_pages(delta) < 0)
+                        goto out;
+
+                if (delta > cpuset_mems_nr(free_huge_pages_node))
+                        goto out;
+        }
+
+        ret = 0;
+        resv_huge_pages += delta;
+        if (delta < 0)
+                return_unused_surplus_pages((unsigned long) -delta);
+
+out:
+        spin_unlock(&hugetlb_lock);
+        return ret;
+}
+
+int hugetlb_reserve_pages(struct inode *inode, long from, long to)
+{
+        long ret, chg;
+
+        chg = region_chg(&inode->i_mapping->private_list, from, to);
+        if (chg < 0)
+                return chg;
 
         ret = hugetlb_acct_memory(chg);
         if (ret < 0)