commit aa888a74977a8f2120ae9332376e179c39a6b07d
tree   1834f8a81e0126ffdd9d9622a9522331dffa2ac8
parent 01ad1c0827db5b3695c53e296dbb2c1da16a0911
author    Andi Kleen <ak@suse.de>  2008-07-24 00:27:47 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2008-07-24 13:47:18 -0400
hugetlb: support larger than MAX_ORDER
This is needed on x86-64 to handle GB pages in hugetlbfs, because it is
not practical to enlarge MAX_ORDER to cover 1GB allocations: with 4kB base
pages a 1GB page is an order-18 allocation, far beyond the order-10 (4MB)
limit the buddy allocator has with the default MAX_ORDER of 11.
Instead, the 1GB pages are allocated only at boot, from the bootmem
allocator, via the hugepages=... option.
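As a hypothetical usage sketch (the 1GB hstate registration and the
hugepagesz= parameter come from other patches in this series, not from
this one), reserving four gigantic pages at boot on x86-64 would look
something like this on the kernel command line:

    hugepagesz=1G hugepages=4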
These 1GB bootmem pages are never freed. In theory freeing them could be
implemented, with some complications, but since it would be a one-way
street (pages of order >= MAX_ORDER cannot be allocated again later) I
decided not to do it for now.
The >= MAX_ORDER code is not ifdef'ed per architecture. It is not very
big, and the ifdef ugliness did not seem worth it.
Known problem: /proc/meminfo and "free" do not include the memory
allocated for GB pages in their totals. This is a little confusing for
the user.
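To illustrate that accounting gap (a hypothetical check, not part of this
patch), comparing the hugepage counters against MemTotal shows the
mismatch:

    $ grep -E 'MemTotal|HugePages_Total|Hugepagesize' /proc/meminfo

HugePages_Total counts the boot-allocated GB pages, but the bootmem memory
backing them is missing from MemTotal (regular huge pages, taken from the
buddy allocator, are included), so the numbers do not add up the way users
expect.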
Acked-by: Andrew Hastings <abh@cray.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  mm/hugetlb.c  83
1 file changed, 81 insertions(+), 2 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5e620e25cf08..1a6fe87555b2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -14,6 +14,7 @@
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
+#include <linux/bootmem.h>
 #include <linux/sysfs.h>
 
 #include <asm/page.h>
@@ -489,7 +490,7 @@ static void free_huge_page(struct page *page)
 	INIT_LIST_HEAD(&page->lru);
 
 	spin_lock(&hugetlb_lock);
-	if (h->surplus_huge_pages_node[nid]) {
+	if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
 		update_and_free_page(h, page);
 		h->surplus_huge_pages--;
 		h->surplus_huge_pages_node[nid]--;
@@ -550,6 +551,9 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 {
 	struct page *page;
 
+	if (h->order >= MAX_ORDER)
+		return NULL;
+
 	page = alloc_pages_node(nid,
 		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
 						__GFP_REPEAT|__GFP_NOWARN,
@@ -616,6 +620,9 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 	struct page *page;
 	unsigned int nid;
 
+	if (h->order >= MAX_ORDER)
+		return NULL;
+
 	/*
 	 * Assume we will successfully allocate the surplus page to
 	 * prevent racing processes from causing the surplus to exceed
@@ -792,6 +799,10 @@ static void return_unused_surplus_pages(struct hstate *h,
 	/* Uncommit the reservation */
 	h->resv_huge_pages -= unused_resv_pages;
 
+	/* Cannot return gigantic pages currently */
+	if (h->order >= MAX_ORDER)
+		return;
+
 	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
 
 	while (remaining_iterations-- && nr_pages) {
@@ -913,6 +924,63 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
+static __initdata LIST_HEAD(huge_boot_pages);
+
+struct huge_bootmem_page {
+	struct list_head list;
+	struct hstate *hstate;
+};
+
+static int __init alloc_bootmem_huge_page(struct hstate *h)
+{
+	struct huge_bootmem_page *m;
+	int nr_nodes = nodes_weight(node_online_map);
+
+	while (nr_nodes) {
+		void *addr;
+
+		addr = __alloc_bootmem_node_nopanic(
+				NODE_DATA(h->hugetlb_next_nid),
+				huge_page_size(h), huge_page_size(h), 0);
+
+		if (addr) {
+			/*
+			 * Use the beginning of the huge page to store the
+			 * huge_bootmem_page struct (until gather_bootmem
+			 * puts them into the mem_map).
+			 */
+			m = addr;
+			if (m)
+				goto found;
+		}
+		hstate_next_node(h);
+		nr_nodes--;
+	}
+	return 0;
+
+found:
+	BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
+	/* Put them into a private list first because mem_map is not up yet */
+	list_add(&m->list, &huge_boot_pages);
+	m->hstate = h;
+	return 1;
+}
+
+/* Put bootmem huge pages into the standard lists after mem_map is up */
+static void __init gather_bootmem_prealloc(void)
+{
+	struct huge_bootmem_page *m;
+
+	list_for_each_entry(m, &huge_boot_pages, list) {
+		struct page *page = virt_to_page(m);
+		struct hstate *h = m->hstate;
+		__ClearPageReserved(page);
+		WARN_ON(page_count(page) != 1);
+		prep_compound_page(page, h->order);
+		prep_new_huge_page(h, page, page_to_nid(page));
+	}
+}
+
 static void __init hugetlb_init_one_hstate(struct hstate *h)
 {
 	unsigned long i;
@@ -923,7 +991,10 @@ static void __init hugetlb_init_one_hstate(struct hstate *h)
 	h->hugetlb_next_nid = first_node(node_online_map);
 
 	for (i = 0; i < h->max_huge_pages; ++i) {
-		if (!alloc_fresh_huge_page(h))
+		if (h->order >= MAX_ORDER) {
+			if (!alloc_bootmem_huge_page(h))
+				break;
+		} else if (!alloc_fresh_huge_page(h))
 			break;
 	}
 	h->max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i;
@@ -956,6 +1027,9 @@ static void try_to_free_low(struct hstate *h, unsigned long count)
 {
 	int i;
 
+	if (h->order >= MAX_ORDER)
+		return;
+
 	for (i = 0; i < MAX_NUMNODES; ++i) {
 		struct page *page, *next;
 		struct list_head *freel = &h->hugepage_freelists[i];
@@ -982,6 +1056,9 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 {
 	unsigned long min_count, ret;
 
+	if (h->order >= MAX_ORDER)
+		return h->max_huge_pages;
+
 	/*
 	 * Increase the pool size
 	 * First take pages out of surplus state. Then make up the
@@ -1210,6 +1287,8 @@ static int __init hugetlb_init(void)
 
 	hugetlb_init_hstates();
 
+	gather_bootmem_prealloc();
+
 	report_hugepages();
 
 	hugetlb_sysfs_init();
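As the set_max_huge_pages() hunk above shows, runtime pool resizing simply
returns for an hstate of order >= MAX_ORDER, so a sysfs request such as
the following (path assumed from the per-hstate layout that
hugetlb_sysfs_init() sets up in this series) has no effect on the
gigantic-page pool:

    echo 8 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages

Only the boot-time hugepages=... reservation determines its size.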