author	Andi Kleen <ak@suse.de>	2008-07-24 00:27:47 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-07-24 13:47:18 -0400
commit	aa888a74977a8f2120ae9332376e179c39a6b07d (patch)
tree	1834f8a81e0126ffdd9d9622a9522331dffa2ac8 /mm/hugetlb.c
parent	01ad1c0827db5b3695c53e296dbb2c1da16a0911 (diff)
hugetlb: support larger than MAX_ORDER
This is needed on x86-64 to handle GB pages in hugetlbfs, because it is
not practical to enlarge MAX_ORDER to cover them: with a 4KB base page a
1GB page is order 18, while the buddy allocator by default only handles
blocks up to order MAX_ORDER-1 = 10 (4MB).
Instead, the 1GB pages are only allocated at boot time with the bootmem
allocator, through the hugepages=... option.
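
As an illustration (not part of this patch, and assuming the x86-64
GB-page and hugepagesz= patches from the same series are also applied),
reserving four 1GB pages at boot might look like:

    hugepagesz=1G hugepages=4

Because the pool can only be created at boot, a later write to
nr_hugepages cannot grow it; set_max_huge_pages below simply returns
the current value for >= MAX_ORDER hstates.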
These 1GB bootmem pages are never freed. In theory it would be possible
to implement that, with some complications, but since it would be a
one-way street anyway (>= MAX_ORDER pages cannot be allocated from the
buddy allocator later) I decided not to do it for now.
The >= MAX_ORDER code is not ifdef'ed per architecture: it is not very
big, and the ifdef ugliness did not seem worth it.
Known problems: /proc/meminfo and "free" do not show the memory
allocated for GB pages in "Total". This is a little confusing for the
user.
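
For illustration, on a hypothetical machine with 8GB of RAM booted with
two 1GB pages (and default_hugepagesz=1G, assuming that companion
patch): the field names below are the real /proc/meminfo ones, but the
numbers are invented:

    $ grep -E 'MemTotal|HugePages_Total|Hugepagesize' /proc/meminfo
    MemTotal:        6189532 kB    <- the 2GB in bootmem GB pages is not counted
    HugePages_Total:       2
    Hugepagesize:    1048576 kB

The pages are carved out by bootmem before the buddy allocator is
populated, so they never enter totalram_pages, which is what MemTotal
reports.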
Acked-by: Andrew Hastings <abh@cray.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	83
1 file changed, 81 insertions, 2 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5e620e25cf08..1a6fe87555b2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -14,6 +14,7 @@
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
+#include <linux/bootmem.h>
 #include <linux/sysfs.h>
 
 #include <asm/page.h>
@@ -489,7 +490,7 @@ static void free_huge_page(struct page *page)
 	INIT_LIST_HEAD(&page->lru);
 
 	spin_lock(&hugetlb_lock);
-	if (h->surplus_huge_pages_node[nid]) {
+	if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
 		update_and_free_page(h, page);
 		h->surplus_huge_pages--;
 		h->surplus_huge_pages_node[nid]--;
@@ -550,6 +551,9 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 {
 	struct page *page;
 
+	if (h->order >= MAX_ORDER)
+		return NULL;
+
 	page = alloc_pages_node(nid,
 		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
 						__GFP_REPEAT|__GFP_NOWARN,
@@ -616,6 +620,9 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 	struct page *page;
 	unsigned int nid;
 
+	if (h->order >= MAX_ORDER)
+		return NULL;
+
 	/*
 	 * Assume we will successfully allocate the surplus page to
 	 * prevent racing processes from causing the surplus to exceed
@@ -792,6 +799,10 @@ static void return_unused_surplus_pages(struct hstate *h,
 	/* Uncommit the reservation */
 	h->resv_huge_pages -= unused_resv_pages;
 
+	/* Cannot return gigantic pages currently */
+	if (h->order >= MAX_ORDER)
+		return;
+
 	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
 
 	while (remaining_iterations-- && nr_pages) {
@@ -913,6 +924,63 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
+static __initdata LIST_HEAD(huge_boot_pages);
+
+struct huge_bootmem_page {
+	struct list_head list;
+	struct hstate *hstate;
+};
+
+static int __init alloc_bootmem_huge_page(struct hstate *h)
+{
+	struct huge_bootmem_page *m;
+	int nr_nodes = nodes_weight(node_online_map);
+
+	while (nr_nodes) {
+		void *addr;
+
+		addr = __alloc_bootmem_node_nopanic(
+				NODE_DATA(h->hugetlb_next_nid),
+				huge_page_size(h), huge_page_size(h), 0);
+
+		if (addr) {
+			/*
+			 * Use the beginning of the huge page to store the
+			 * huge_bootmem_page struct (until gather_bootmem
+			 * puts them into the mem_map).
+			 */
+			m = addr;
+			if (m)
+				goto found;
+		}
+		hstate_next_node(h);
+		nr_nodes--;
+	}
+	return 0;
+
+found:
+	BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
+	/* Put them into a private list first because mem_map is not up yet */
+	list_add(&m->list, &huge_boot_pages);
+	m->hstate = h;
+	return 1;
+}
+
+/* Put bootmem huge pages into the standard lists after mem_map is up */
+static void __init gather_bootmem_prealloc(void)
+{
+	struct huge_bootmem_page *m;
+
+	list_for_each_entry(m, &huge_boot_pages, list) {
+		struct page *page = virt_to_page(m);
+		struct hstate *h = m->hstate;
+		__ClearPageReserved(page);
+		WARN_ON(page_count(page) != 1);
+		prep_compound_page(page, h->order);
+		prep_new_huge_page(h, page, page_to_nid(page));
+	}
+}
+
 static void __init hugetlb_init_one_hstate(struct hstate *h)
 {
 	unsigned long i;
@@ -923,7 +991,10 @@ static void __init hugetlb_init_one_hstate(struct hstate *h)
 	h->hugetlb_next_nid = first_node(node_online_map);
 
 	for (i = 0; i < h->max_huge_pages; ++i) {
-		if (!alloc_fresh_huge_page(h))
+		if (h->order >= MAX_ORDER) {
+			if (!alloc_bootmem_huge_page(h))
+				break;
+		} else if (!alloc_fresh_huge_page(h))
 			break;
 	}
 	h->max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i;
@@ -956,6 +1027,9 @@ static void try_to_free_low(struct hstate *h, unsigned long count)
 {
 	int i;
 
+	if (h->order >= MAX_ORDER)
+		return;
+
 	for (i = 0; i < MAX_NUMNODES; ++i) {
 		struct page *page, *next;
 		struct list_head *freel = &h->hugepage_freelists[i];
@@ -982,6 +1056,9 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 {
 	unsigned long min_count, ret;
 
+	if (h->order >= MAX_ORDER)
+		return h->max_huge_pages;
+
 	/*
 	 * Increase the pool size
 	 * First take pages out of surplus state. Then make up the
@@ -1210,6 +1287,8 @@ static int __init hugetlb_init(void)
 
 	hugetlb_init_hstates();
 
+	gather_bootmem_prealloc();
+
 	report_hugepages();
 
 	hugetlb_sysfs_init();
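
For completeness, a hypothetical userspace sketch that consumes one of
these boot-allocated GB pages. It assumes a 1GB hugetlbfs instance
mounted at /mnt/huge1g (e.g. mount -t hugetlbfs -o pagesize=1G none
/mnt/huge1g, using the per-mount page size support from this patch
series) and a pool reserved via hugepages= at boot; the mount point and
file name are made up:

    /* gbpage_test.c - map and touch a single 1GB hugetlbfs page.
     * Hypothetical example; not part of this patch. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #define GB (1024UL * 1024 * 1024)

    int main(void)
    {
            /* Files in a hugetlbfs mount are backed by huge pages. */
            int fd = open("/mnt/huge1g/test", O_CREAT | O_RDWR, 0600);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* The length must be a multiple of the mount's huge page size. */
            void *p = mmap(NULL, GB, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED) {
                    perror("mmap");
                    close(fd);
                    return 1;
            }
            memset(p, 0x5a, GB);    /* fault the gigantic page in */
            printf("1GB huge page mapped at %p\n", p);
            munmap(p, GB);
            close(fd);
            unlink("/mnt/huge1g/test");
            return 0;
    }

If the pool has no free GB page, the mmap fails with ENOMEM at map time,
since hugetlbfs reserves pages up front for shared mappings; checking
HugePages_Free beforehand is the friendlier approach.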