author		Lee Schermerhorn <lee.schermerhorn@hp.com>	2009-12-14 20:58:16 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-12-15 11:53:12 -0500
commit		6ae11b278bca1cd41651bae49a8c69de2f6a6262 (patch)
tree		8bf4203ce676cac4f5ce1d03b35adbca1571d295 /mm/hugetlb.c
parent		9a76db099709388ae4126c4f441358b97c6ba20c (diff)
hugetlb: add nodemask arg to huge page alloc, free and surplus adjust functions
In preparation for constraining huge page allocation and freeing by the
controlling task's numa mempolicy, add a "nodes_allowed" nodemask pointer
to the allocate, free and surplus adjustment functions. For now, pass
NULL to indicate default behavior--i.e., use node_online_map. A
subsequent patch will derive a non-default mask from the controlling
task's numa mempolicy.
Note that this method of updating the global hstate nr_hugepages under the
constraint of a nodemask simplifies keeping the global state
consistent--especially the number of persistent and surplus pages relative
to reservations and overcommit limits. There are undoubtedly other ways
to do this, but this works for both interfaces: mempolicy and per node
attributes.
[rientjes@google.com: fix HIGHMEM compile error]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Reviewed-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: Andi Kleen <andi@firstfloor.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Eric Whitney <eric.whitney@hp.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	125
1 file changed, 72 insertions(+), 53 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bffcf774f60b..324d1abae876 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -622,48 +622,56 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 }
 
 /*
- * common helper function for hstate_next_node_to_{alloc|free}.
- * return next node in node_online_map, wrapping at end.
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed.  Ensure that we use an allowed
+ * node for alloc or free.
  */
-static int next_node_allowed(int nid)
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
 {
-	nid = next_node(nid, node_online_map);
+	nid = next_node(nid, *nodes_allowed);
 	if (nid == MAX_NUMNODES)
-		nid = first_node(node_online_map);
+		nid = first_node(*nodes_allowed);
 	VM_BUG_ON(nid >= MAX_NUMNODES);
 
 	return nid;
 }
 
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+	if (!node_isset(nid, *nodes_allowed))
+		nid = next_node_allowed(nid, nodes_allowed);
+	return nid;
+}
+
 /*
- * Use a helper variable to find the next node and then
- * copy it back to next_nid_to_alloc afterwards:
- * otherwise there's a window in which a racer might
- * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
- * But we don't need to use a spin_lock here: it really
- * doesn't matter if occasionally a racer chooses the
- * same nid as we do.  Move nid forward in the mask even
- * if we just successfully allocated a hugepage so that
- * the next caller gets hugepages on the next node.
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
  */
-static int hstate_next_node_to_alloc(struct hstate *h)
+static int hstate_next_node_to_alloc(struct hstate *h,
+					nodemask_t *nodes_allowed)
 {
-	int nid, next_nid;
+	int nid;
+
+	VM_BUG_ON(!nodes_allowed);
+
+	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
 
-	nid = h->next_nid_to_alloc;
-	next_nid = next_node_allowed(nid);
-	h->next_nid_to_alloc = next_nid;
 	return nid;
 }
 
-static int alloc_fresh_huge_page(struct hstate *h)
+static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
 {
 	struct page *page;
 	int start_nid;
 	int next_nid;
 	int ret = 0;
 
-	start_nid = hstate_next_node_to_alloc(h);
+	start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
 	next_nid = start_nid;
 
 	do {
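For illustration, a rough user-space sketch of the wrap-around and membership logic the two helpers above implement; a plain bitmask stands in for nodemask_t, the node numbers are made up, and this is not kernel code:

/*
 * Rough user-space sketch of next_node_allowed()/get_valid_node_allowed():
 * a plain bitmask stands in for nodemask_t, node numbers are made up.
 */
#include <stdio.h>

#define MAX_NUMNODES 8				/* illustrative value */

/* next node set in the mask after nid, wrapping at the end */
static int next_node_allowed(int nid, unsigned int nodes_allowed)
{
	int i;

	for (i = 1; i <= MAX_NUMNODES; i++) {
		int candidate = (nid + i) % MAX_NUMNODES;

		if (nodes_allowed & (1u << candidate))
			return candidate;
	}
	return nid;				/* empty mask: not expected here */
}

/* if the saved nid is no longer allowed, advance to one that is */
static int get_valid_node_allowed(int nid, unsigned int nodes_allowed)
{
	if (!(nodes_allowed & (1u << nid)))
		nid = next_node_allowed(nid, nodes_allowed);
	return nid;
}

int main(void)
{
	unsigned int allowed = (1u << 1) | (1u << 3);	/* nodes 1 and 3 */
	int next_nid_to_alloc = 2;	/* stale cursor left by an older mask */
	int nid;

	/* what hstate_next_node_to_alloc() does: take "this node", advance */
	nid = get_valid_node_allowed(next_nid_to_alloc, allowed);
	next_nid_to_alloc = next_node_allowed(nid, allowed);

	/* prints: allocate on node 3, next cursor 1 */
	printf("allocate on node %d, next cursor %d\n", nid, next_nid_to_alloc);
	return 0;
}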
@@ -672,7 +680,7 @@ static int alloc_fresh_huge_page(struct hstate *h)
 			ret = 1;
 			break;
 		}
-		next_nid = hstate_next_node_to_alloc(h);
+		next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
 	} while (next_nid != start_nid);
 
 	if (ret)
@@ -684,18 +692,20 @@ static int alloc_fresh_huge_page(struct hstate *h)
 }
 
 /*
- * helper for free_pool_huge_page() - return the next node
- * from which to free a huge page.  Advance the next node id
- * whether or not we find a free huge page to free so that the
- * next attempt to free addresses the next node.
+ * helper for free_pool_huge_page() - return the previously saved
+ * node ["this node"] from which to free a huge page.  Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
  */
-static int hstate_next_node_to_free(struct hstate *h)
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
 {
-	int nid, next_nid;
+	int nid;
+
+	VM_BUG_ON(!nodes_allowed);
+
+	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
 
-	nid = h->next_nid_to_free;
-	next_nid = next_node_allowed(nid);
-	h->next_nid_to_free = next_nid;
 	return nid;
 }
 
@@ -705,13 +715,14 @@ static int hstate_next_node_to_free(struct hstate *h)
  * balanced over allowed nodes.
  * Called with hugetlb_lock locked.
  */
-static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
+static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+						 bool acct_surplus)
 {
 	int start_nid;
 	int next_nid;
 	int ret = 0;
 
-	start_nid = hstate_next_node_to_free(h);
+	start_nid = hstate_next_node_to_free(h, nodes_allowed);
 	next_nid = start_nid;
 
 	do {
@@ -735,7 +746,7 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
 			ret = 1;
 			break;
 		}
-		next_nid = hstate_next_node_to_free(h);
+		next_nid = hstate_next_node_to_free(h, nodes_allowed);
 	} while (next_nid != start_nid);
 
 	return ret;
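The same round-robin walk drives both alloc_fresh_huge_page() and free_pool_huge_page(): take the saved cursor as "this node", advance the cursor, and try each allowed node once until the walk wraps back to the start. A rough user-space sketch of that pattern; try_one_node() is a hypothetical stand-in for the real per-node allocation or free, and the mask and node count are made up:

/*
 * Rough user-space sketch of the retry pattern shared by
 * alloc_fresh_huge_page() and free_pool_huge_page(): start at the saved
 * per-hstate cursor, try each allowed node once, stop when the walk wraps
 * back to the starting node.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_NUMNODES 8				/* illustrative value */

/* first node set in the mask at or after nid, wrapping at the end */
static int next_allowed(int nid, unsigned int mask)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; i++) {
		int candidate = (nid + i) % MAX_NUMNODES;

		if (mask & (1u << candidate))
			return candidate;
	}
	return nid;				/* empty mask: not expected here */
}

static bool try_one_node(int nid)
{
	printf("trying node %d\n", nid);
	return nid == 3;			/* pretend only node 3 succeeds */
}

static int walk_allowed_nodes(int *cursor, unsigned int mask)
{
	/* like hstate_next_node_to_alloc(): take "this node", advance cursor */
	int start_nid = next_allowed(*cursor, mask);
	int next_nid = start_nid;
	int ret = 0;

	*cursor = next_allowed(start_nid + 1, mask);
	do {
		if (try_one_node(next_nid)) {
			ret = 1;
			break;
		}
		next_nid = next_allowed(*cursor, mask);
		*cursor = next_allowed(next_nid + 1, mask);
	} while (next_nid != start_nid);

	return ret;
}

int main(void)
{
	unsigned int allowed = 0x0f;		/* nodes 0-3 */
	int cursor = 1;
	int ok = walk_allowed_nodes(&cursor, allowed);

	printf("success=%d next cursor=%d\n", ok, cursor);
	return 0;
}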
@@ -937,7 +948,7 @@ static void return_unused_surplus_pages(struct hstate *h,
 	 * on-line nodes for us and will handle the hstate accounting.
 	 */
 	while (nr_pages--) {
-		if (!free_pool_huge_page(h, 1))
+		if (!free_pool_huge_page(h, &node_online_map, 1))
 			break;
 	}
 }
@@ -1047,7 +1058,8 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 		void *addr;
 
 		addr = __alloc_bootmem_node_nopanic(
-				NODE_DATA(hstate_next_node_to_alloc(h)),
+				NODE_DATA(hstate_next_node_to_alloc(h,
+								&node_online_map)),
 				huge_page_size(h), huge_page_size(h), 0);
 
 		if (addr) {
@@ -1102,7 +1114,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 		if (h->order >= MAX_ORDER) {
 			if (!alloc_bootmem_huge_page(h))
 				break;
-		} else if (!alloc_fresh_huge_page(h))
+		} else if (!alloc_fresh_huge_page(h, &node_online_map))
 			break;
 	}
 	h->max_huge_pages = i;
@@ -1144,14 +1156,15 @@ static void __init report_hugepages(void)
 }
 
 #ifdef CONFIG_HIGHMEM
-static void try_to_free_low(struct hstate *h, unsigned long count)
+static void try_to_free_low(struct hstate *h, unsigned long count,
+						nodemask_t *nodes_allowed)
 {
 	int i;
 
 	if (h->order >= MAX_ORDER)
 		return;
 
-	for (i = 0; i < MAX_NUMNODES; ++i) {
+	for_each_node_mask(i, *nodes_allowed) {
 		struct page *page, *next;
 		struct list_head *freel = &h->hugepage_freelists[i];
 		list_for_each_entry_safe(page, next, freel, lru) {
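The loop change in try_to_free_low() is the behavioral point of this hunk: instead of scanning every possible node index, only nodes set in *nodes_allowed are visited. A small user-space sketch of the difference, with a plain bitmask standing in for nodemask_t and made-up node numbers:

/*
 * Rough user-space sketch of the loop change in try_to_free_low():
 * the old code scanned every node index, the new code visits only nodes
 * set in *nodes_allowed (modelled here as bits in a plain mask).
 */
#include <stdio.h>

#define MAX_NUMNODES 8				/* illustrative value */

int main(void)
{
	unsigned int nodes_allowed = (1u << 0) | (1u << 2);	/* nodes 0 and 2 */
	int i;

	/* old style: for (i = 0; i < MAX_NUMNODES; ++i) */
	for (i = 0; i < MAX_NUMNODES; ++i)
		printf("old loop visits node %d\n", i);

	/* new style, analogous to for_each_node_mask(i, *nodes_allowed) */
	for (i = 0; i < MAX_NUMNODES; ++i)
		if (nodes_allowed & (1u << i))
			printf("new loop visits node %d\n", i);

	return 0;
}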
@@ -1167,7 +1180,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count)
 	}
 }
 #else
-static inline void try_to_free_low(struct hstate *h, unsigned long count)
+static inline void try_to_free_low(struct hstate *h, unsigned long count,
+						nodemask_t *nodes_allowed)
 {
 }
 #endif
@@ -1177,7 +1191,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
  * balanced by operating on them in a round-robin fashion.
  * Returns 1 if an adjustment was made.
  */
-static int adjust_pool_surplus(struct hstate *h, int delta)
+static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
+				int delta)
 {
 	int start_nid, next_nid;
 	int ret = 0;
@@ -1185,9 +1200,9 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
 	VM_BUG_ON(delta != -1 && delta != 1);
 
 	if (delta < 0)
-		start_nid = hstate_next_node_to_alloc(h);
+		start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
 	else
-		start_nid = hstate_next_node_to_free(h);
+		start_nid = hstate_next_node_to_free(h, nodes_allowed);
 	next_nid = start_nid;
 
 	do {
@@ -1197,7 +1212,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
 			 * To shrink on this node, there must be a surplus page
 			 */
 			if (!h->surplus_huge_pages_node[nid]) {
-				next_nid = hstate_next_node_to_alloc(h);
+				next_nid = hstate_next_node_to_alloc(h,
+								nodes_allowed);
 				continue;
 			}
 		}
@@ -1207,7 +1223,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
 		 */
 		if (h->surplus_huge_pages_node[nid] >=
 						h->nr_huge_pages_node[nid]) {
-			next_nid = hstate_next_node_to_free(h);
+			next_nid = hstate_next_node_to_free(h,
+							nodes_allowed);
 			continue;
 		}
 	}
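adjust_pool_surplus() only adjusts nodes that can absorb the change: shrinking the surplus (delta == -1) needs at least one surplus page on the node, and growing it (delta == 1) needs at least one non-surplus page there. A rough user-space sketch of just those two checks, with made-up per-node counts; the counter updates themselves fall outside the hunks shown here:

/*
 * Rough user-space sketch of the two per-node checks adjust_pool_surplus()
 * makes above; the per-node counts are made up.
 */
#include <stdio.h>

#define MAX_NUMNODES 4				/* illustrative value */

static unsigned long nr_huge_pages_node[MAX_NUMNODES]      = { 4, 4, 2, 0 };
static unsigned long surplus_huge_pages_node[MAX_NUMNODES] = { 1, 0, 2, 0 };

/* can this node absorb a surplus adjustment of delta (-1 or +1)? */
static int can_adjust(int nid, int delta)
{
	if (delta < 0 && !surplus_huge_pages_node[nid])
		return 0;	/* no surplus page to convert to persistent */
	if (delta > 0 &&
	    surplus_huge_pages_node[nid] >= nr_huge_pages_node[nid])
		return 0;	/* every page on this node is already surplus */
	return 1;
}

int main(void)
{
	int nid;

	for (nid = 0; nid < MAX_NUMNODES; nid++)
		printf("node %d: delta=-1 %s, delta=+1 %s\n", nid,
		       can_adjust(nid, -1) ? "ok" : "skip",
		       can_adjust(nid, +1) ? "ok" : "skip");
	return 0;
}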
@@ -1222,7 +1239,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
 }
 
 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
+static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
+						nodemask_t *nodes_allowed)
 {
 	unsigned long min_count, ret;
 
@@ -1242,7 +1260,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 	 */
 	spin_lock(&hugetlb_lock);
 	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
-		if (!adjust_pool_surplus(h, -1))
+		if (!adjust_pool_surplus(h, nodes_allowed, -1))
 			break;
 	}
 
@@ -1253,7 +1271,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 		 * and reducing the surplus.
 		 */
 		spin_unlock(&hugetlb_lock);
-		ret = alloc_fresh_huge_page(h);
+		ret = alloc_fresh_huge_page(h, nodes_allowed);
 		spin_lock(&hugetlb_lock);
 		if (!ret)
 			goto out;
@@ -1277,13 +1295,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 	 */
 	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
 	min_count = max(count, min_count);
-	try_to_free_low(h, min_count);
+	try_to_free_low(h, min_count, nodes_allowed);
 	while (min_count < persistent_huge_pages(h)) {
-		if (!free_pool_huge_page(h, 0))
+		if (!free_pool_huge_page(h, nodes_allowed, 0))
 			break;
 	}
 	while (count < persistent_huge_pages(h)) {
-		if (!adjust_pool_surplus(h, 1))
+		if (!adjust_pool_surplus(h, nodes_allowed, 1))
 			break;
 	}
 out:
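The bounds used by set_max_huge_pages() follow from two expressions visible above: persistent_huge_pages() is nr_huge_pages minus surplus_huge_pages, and min_count is resv_huge_pages + nr_huge_pages - free_huge_pages (reserved plus in-use pages), clamped up to the requested count. A small worked example with made-up numbers:

/*
 * Worked example of the bounds in set_max_huge_pages(), with made-up
 * numbers: nr_huge_pages = 100, surplus_huge_pages = 10,
 * free_huge_pages = 30, resv_huge_pages = 5, requested count = 50.
 *
 *   persistent_huge_pages = 100 - 10 = 90
 *   min_count = 5 + 100 - 30         = 75   (reserved + in-use pages)
 *   min_count = max(50, 75)          = 75
 *
 * so persistent pages can only be shrunk to 75 here, not to the
 * requested 50, because in-use and reserved pages cannot be freed.
 */
#include <stdio.h>

int main(void)
{
	unsigned long nr_huge_pages = 100, surplus_huge_pages = 10;
	unsigned long free_huge_pages = 30, resv_huge_pages = 5;
	unsigned long count = 50;

	unsigned long persistent = nr_huge_pages - surplus_huge_pages;
	unsigned long min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;

	if (min_count < count)
		min_count = count;

	/* prints: persistent=90 min_count=75 */
	printf("persistent=%lu min_count=%lu\n", persistent, min_count);
	return 0;
}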
@@ -1329,7 +1347,7 @@ static ssize_t nr_hugepages_store(struct kobject *kobj,
 	if (err)
 		return 0;
 
-	h->max_huge_pages = set_max_huge_pages(h, input);
+	h->max_huge_pages = set_max_huge_pages(h, input, &node_online_map);
 
 	return count;
 }
@@ -1571,7 +1589,8 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 	proc_doulongvec_minmax(table, write, buffer, length, ppos);
 
 	if (write)
-		h->max_huge_pages = set_max_huge_pages(h, tmp);
+		h->max_huge_pages = set_max_huge_pages(h, tmp,
+							&node_online_map);
 
 	return 0;
 }