Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 565
1 file changed, 460 insertions, 105 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5d7601b02874..4c9e6bbf3772 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2,7 +2,6 @@
2 | * Generic hugetlb support. | 2 | * Generic hugetlb support. |
3 | * (C) William Irwin, April 2004 | 3 | * (C) William Irwin, April 2004 |
4 | */ | 4 | */ |
5 | #include <linux/gfp.h> | ||
6 | #include <linux/list.h> | 5 | #include <linux/list.h> |
7 | #include <linux/init.h> | 6 | #include <linux/init.h> |
8 | #include <linux/module.h> | 7 | #include <linux/module.h> |
@@ -18,12 +17,14 @@
18 | #include <linux/mutex.h> | 17 | #include <linux/mutex.h> |
19 | #include <linux/bootmem.h> | 18 | #include <linux/bootmem.h> |
20 | #include <linux/sysfs.h> | 19 | #include <linux/sysfs.h> |
20 | #include <linux/slab.h> | ||
21 | 21 | ||
22 | #include <asm/page.h> | 22 | #include <asm/page.h> |
23 | #include <asm/pgtable.h> | 23 | #include <asm/pgtable.h> |
24 | #include <asm/io.h> | 24 | #include <asm/io.h> |
25 | 25 | ||
26 | #include <linux/hugetlb.h> | 26 | #include <linux/hugetlb.h> |
27 | #include <linux/node.h> | ||
27 | #include "internal.h" | 28 | #include "internal.h" |
28 | 29 | ||
29 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 30 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
@@ -401,7 +402,7 @@ static void clear_huge_page(struct page *page,
401 | { | 402 | { |
402 | int i; | 403 | int i; |
403 | 404 | ||
404 | if (unlikely(sz > MAX_ORDER_NR_PAGES)) { | 405 | if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { |
405 | clear_gigantic_page(page, addr, sz); | 406 | clear_gigantic_page(page, addr, sz); |
406 | return; | 407 | return; |
407 | } | 408 | } |
@@ -545,6 +546,7 @@ static void free_huge_page(struct page *page)
545 | 546 | ||
546 | mapping = (struct address_space *) page_private(page); | 547 | mapping = (struct address_space *) page_private(page); |
547 | set_page_private(page, 0); | 548 | set_page_private(page, 0); |
549 | page->mapping = NULL; | ||
548 | BUG_ON(page_count(page)); | 550 | BUG_ON(page_count(page)); |
549 | INIT_LIST_HEAD(&page->lru); | 551 | INIT_LIST_HEAD(&page->lru); |
550 | 552 | ||
@@ -622,42 +624,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
622 | } | 624 | } |
623 | 625 | ||
624 | /* | 626 | /* |
625 | * Use a helper variable to find the next node and then | 627 | * common helper functions for hstate_next_node_to_{alloc|free}. |
626 | * copy it back to next_nid_to_alloc afterwards: | 628 | * We may have allocated or freed a huge page based on a different |
627 | * otherwise there's a window in which a racer might | 629 | * nodes_allowed previously, so h->next_node_to_{alloc|free} might |
628 | * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. | 630 | * be outside of *nodes_allowed. Ensure that we use an allowed |
629 | * But we don't need to use a spin_lock here: it really | 631 | * node for alloc or free. |
630 | * doesn't matter if occasionally a racer chooses the | ||
631 | * same nid as we do. Move nid forward in the mask even | ||
632 | * if we just successfully allocated a hugepage so that | ||
633 | * the next caller gets hugepages on the next node. | ||
634 | */ | 632 | */ |
635 | static int hstate_next_node_to_alloc(struct hstate *h) | 633 | static int next_node_allowed(int nid, nodemask_t *nodes_allowed) |
636 | { | 634 | { |
637 | int next_nid; | 635 | nid = next_node(nid, *nodes_allowed); |
638 | next_nid = next_node(h->next_nid_to_alloc, node_online_map); | 636 | if (nid == MAX_NUMNODES) |
639 | if (next_nid == MAX_NUMNODES) | 637 | nid = first_node(*nodes_allowed); |
640 | next_nid = first_node(node_online_map); | 638 | VM_BUG_ON(nid >= MAX_NUMNODES); |
641 | h->next_nid_to_alloc = next_nid; | 639 | |
642 | return next_nid; | 640 | return nid; |
643 | } | 641 | } |
644 | 642 | ||
645 | static int alloc_fresh_huge_page(struct hstate *h) | 643 | static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) |
644 | { | ||
645 | if (!node_isset(nid, *nodes_allowed)) | ||
646 | nid = next_node_allowed(nid, nodes_allowed); | ||
647 | return nid; | ||
648 | } | ||
649 | |||
650 | /* | ||
651 | * returns the previously saved node ["this node"] from which to | ||
652 | * allocate a persistent huge page for the pool and advance the | ||
653 | * next node from which to allocate, handling wrap at end of node | ||
654 | * mask. | ||
655 | */ | ||
656 | static int hstate_next_node_to_alloc(struct hstate *h, | ||
657 | nodemask_t *nodes_allowed) | ||
658 | { | ||
659 | int nid; | ||
660 | |||
661 | VM_BUG_ON(!nodes_allowed); | ||
662 | |||
663 | nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); | ||
664 | h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); | ||
665 | |||
666 | return nid; | ||
667 | } | ||
668 | |||
669 | static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) | ||
646 | { | 670 | { |
647 | struct page *page; | 671 | struct page *page; |
648 | int start_nid; | 672 | int start_nid; |
649 | int next_nid; | 673 | int next_nid; |
650 | int ret = 0; | 674 | int ret = 0; |
651 | 675 | ||
652 | start_nid = h->next_nid_to_alloc; | 676 | start_nid = hstate_next_node_to_alloc(h, nodes_allowed); |
653 | next_nid = start_nid; | 677 | next_nid = start_nid; |
654 | 678 | ||
655 | do { | 679 | do { |
656 | page = alloc_fresh_huge_page_node(h, next_nid); | 680 | page = alloc_fresh_huge_page_node(h, next_nid); |
657 | if (page) | 681 | if (page) { |
658 | ret = 1; | 682 | ret = 1; |
659 | next_nid = hstate_next_node_to_alloc(h); | 683 | break; |
660 | } while (!page && next_nid != start_nid); | 684 | } |
685 | next_nid = hstate_next_node_to_alloc(h, nodes_allowed); | ||
686 | } while (next_nid != start_nid); | ||
661 | 687 | ||
662 | if (ret) | 688 | if (ret) |
663 | count_vm_event(HTLB_BUDDY_PGALLOC); | 689 | count_vm_event(HTLB_BUDDY_PGALLOC); |
@@ -668,17 +694,21 @@ static int alloc_fresh_huge_page(struct hstate *h)
668 | } | 694 | } |
669 | 695 | ||
670 | /* | 696 | /* |
671 | * helper for free_pool_huge_page() - find next node | 697 | * helper for free_pool_huge_page() - return the previously saved |
672 | * from which to free a huge page | 698 | * node ["this node"] from which to free a huge page. Advance the |
699 | * next node id whether or not we find a free huge page to free so | ||
700 | * that the next attempt to free addresses the next node. | ||
673 | */ | 701 | */ |
674 | static int hstate_next_node_to_free(struct hstate *h) | 702 | static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) |
675 | { | 703 | { |
676 | int next_nid; | 704 | int nid; |
677 | next_nid = next_node(h->next_nid_to_free, node_online_map); | 705 | |
678 | if (next_nid == MAX_NUMNODES) | 706 | VM_BUG_ON(!nodes_allowed); |
679 | next_nid = first_node(node_online_map); | 707 | |
680 | h->next_nid_to_free = next_nid; | 708 | nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); |
681 | return next_nid; | 709 | h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); |
710 | |||
711 | return nid; | ||
682 | } | 712 | } |
683 | 713 | ||
684 | /* | 714 | /* |
@@ -687,13 +717,14 @@ static int hstate_next_node_to_free(struct hstate *h)
687 | * balanced over allowed nodes. | 717 | * balanced over allowed nodes. |
688 | * Called with hugetlb_lock locked. | 718 | * Called with hugetlb_lock locked. |
689 | */ | 719 | */ |
690 | static int free_pool_huge_page(struct hstate *h, bool acct_surplus) | 720 | static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, |
721 | bool acct_surplus) | ||
691 | { | 722 | { |
692 | int start_nid; | 723 | int start_nid; |
693 | int next_nid; | 724 | int next_nid; |
694 | int ret = 0; | 725 | int ret = 0; |
695 | 726 | ||
696 | start_nid = h->next_nid_to_free; | 727 | start_nid = hstate_next_node_to_free(h, nodes_allowed); |
697 | next_nid = start_nid; | 728 | next_nid = start_nid; |
698 | 729 | ||
699 | do { | 730 | do { |
@@ -715,9 +746,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
715 | } | 746 | } |
716 | update_and_free_page(h, page); | 747 | update_and_free_page(h, page); |
717 | ret = 1; | 748 | ret = 1; |
749 | break; | ||
718 | } | 750 | } |
719 | next_nid = hstate_next_node_to_free(h); | 751 | next_nid = hstate_next_node_to_free(h, nodes_allowed); |
720 | } while (!ret && next_nid != start_nid); | 752 | } while (next_nid != start_nid); |
721 | 753 | ||
722 | return ret; | 754 | return ret; |
723 | } | 755 | } |
@@ -911,14 +943,14 @@ static void return_unused_surplus_pages(struct hstate *h,
911 | 943 | ||
912 | /* | 944 | /* |
913 | * We want to release as many surplus pages as possible, spread | 945 | * We want to release as many surplus pages as possible, spread |
914 | * evenly across all nodes. Iterate across all nodes until we | 946 | * evenly across all nodes with memory. Iterate across these nodes |
915 | * can no longer free unreserved surplus pages. This occurs when | 947 | * until we can no longer free unreserved surplus pages. This occurs |
916 | * the nodes with surplus pages have no free pages. | 948 | * when the nodes with surplus pages have no free pages. |
917 | * free_pool_huge_page() will balance the the frees across the | 949 | * free_pool_huge_page() will balance the the freed pages across the |
918 | * on-line nodes for us and will handle the hstate accounting. | 950 | * on-line nodes with memory and will handle the hstate accounting. |
919 | */ | 951 | */ |
920 | while (nr_pages--) { | 952 | while (nr_pages--) { |
921 | if (!free_pool_huge_page(h, 1)) | 953 | if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) |
922 | break; | 954 | break; |
923 | } | 955 | } |
924 | } | 956 | } |
@@ -1007,7 +1039,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1007 | page = alloc_buddy_huge_page(h, vma, addr); | 1039 | page = alloc_buddy_huge_page(h, vma, addr); |
1008 | if (!page) { | 1040 | if (!page) { |
1009 | hugetlb_put_quota(inode->i_mapping, chg); | 1041 | hugetlb_put_quota(inode->i_mapping, chg); |
1010 | return ERR_PTR(-VM_FAULT_OOM); | 1042 | return ERR_PTR(-VM_FAULT_SIGBUS); |
1011 | } | 1043 | } |
1012 | } | 1044 | } |
1013 | 1045 | ||
@@ -1022,16 +1054,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1022 | int __weak alloc_bootmem_huge_page(struct hstate *h) | 1054 | int __weak alloc_bootmem_huge_page(struct hstate *h) |
1023 | { | 1055 | { |
1024 | struct huge_bootmem_page *m; | 1056 | struct huge_bootmem_page *m; |
1025 | int nr_nodes = nodes_weight(node_online_map); | 1057 | int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); |
1026 | 1058 | ||
1027 | while (nr_nodes) { | 1059 | while (nr_nodes) { |
1028 | void *addr; | 1060 | void *addr; |
1029 | 1061 | ||
1030 | addr = __alloc_bootmem_node_nopanic( | 1062 | addr = __alloc_bootmem_node_nopanic( |
1031 | NODE_DATA(h->next_nid_to_alloc), | 1063 | NODE_DATA(hstate_next_node_to_alloc(h, |
1064 | &node_states[N_HIGH_MEMORY])), | ||
1032 | huge_page_size(h), huge_page_size(h), 0); | 1065 | huge_page_size(h), huge_page_size(h), 0); |
1033 | 1066 | ||
1034 | hstate_next_node_to_alloc(h); | ||
1035 | if (addr) { | 1067 | if (addr) { |
1036 | /* | 1068 | /* |
1037 | * Use the beginning of the huge page to store the | 1069 | * Use the beginning of the huge page to store the |
@@ -1084,7 +1116,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1084 | if (h->order >= MAX_ORDER) { | 1116 | if (h->order >= MAX_ORDER) { |
1085 | if (!alloc_bootmem_huge_page(h)) | 1117 | if (!alloc_bootmem_huge_page(h)) |
1086 | break; | 1118 | break; |
1087 | } else if (!alloc_fresh_huge_page(h)) | 1119 | } else if (!alloc_fresh_huge_page(h, |
1120 | &node_states[N_HIGH_MEMORY])) | ||
1088 | break; | 1121 | break; |
1089 | } | 1122 | } |
1090 | h->max_huge_pages = i; | 1123 | h->max_huge_pages = i; |
@@ -1126,14 +1159,15 @@ static void __init report_hugepages(void)
1126 | } | 1159 | } |
1127 | 1160 | ||
1128 | #ifdef CONFIG_HIGHMEM | 1161 | #ifdef CONFIG_HIGHMEM |
1129 | static void try_to_free_low(struct hstate *h, unsigned long count) | 1162 | static void try_to_free_low(struct hstate *h, unsigned long count, |
1163 | nodemask_t *nodes_allowed) | ||
1130 | { | 1164 | { |
1131 | int i; | 1165 | int i; |
1132 | 1166 | ||
1133 | if (h->order >= MAX_ORDER) | 1167 | if (h->order >= MAX_ORDER) |
1134 | return; | 1168 | return; |
1135 | 1169 | ||
1136 | for (i = 0; i < MAX_NUMNODES; ++i) { | 1170 | for_each_node_mask(i, *nodes_allowed) { |
1137 | struct page *page, *next; | 1171 | struct page *page, *next; |
1138 | struct list_head *freel = &h->hugepage_freelists[i]; | 1172 | struct list_head *freel = &h->hugepage_freelists[i]; |
1139 | list_for_each_entry_safe(page, next, freel, lru) { | 1173 | list_for_each_entry_safe(page, next, freel, lru) { |
@@ -1149,7 +1183,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count)
1149 | } | 1183 | } |
1150 | } | 1184 | } |
1151 | #else | 1185 | #else |
1152 | static inline void try_to_free_low(struct hstate *h, unsigned long count) | 1186 | static inline void try_to_free_low(struct hstate *h, unsigned long count, |
1187 | nodemask_t *nodes_allowed) | ||
1153 | { | 1188 | { |
1154 | } | 1189 | } |
1155 | #endif | 1190 | #endif |
@@ -1159,7 +1194,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
1159 | * balanced by operating on them in a round-robin fashion. | 1194 | * balanced by operating on them in a round-robin fashion. |
1160 | * Returns 1 if an adjustment was made. | 1195 | * Returns 1 if an adjustment was made. |
1161 | */ | 1196 | */ |
1162 | static int adjust_pool_surplus(struct hstate *h, int delta) | 1197 | static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, |
1198 | int delta) | ||
1163 | { | 1199 | { |
1164 | int start_nid, next_nid; | 1200 | int start_nid, next_nid; |
1165 | int ret = 0; | 1201 | int ret = 0; |
@@ -1167,29 +1203,33 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
1167 | VM_BUG_ON(delta != -1 && delta != 1); | 1203 | VM_BUG_ON(delta != -1 && delta != 1); |
1168 | 1204 | ||
1169 | if (delta < 0) | 1205 | if (delta < 0) |
1170 | start_nid = h->next_nid_to_alloc; | 1206 | start_nid = hstate_next_node_to_alloc(h, nodes_allowed); |
1171 | else | 1207 | else |
1172 | start_nid = h->next_nid_to_free; | 1208 | start_nid = hstate_next_node_to_free(h, nodes_allowed); |
1173 | next_nid = start_nid; | 1209 | next_nid = start_nid; |
1174 | 1210 | ||
1175 | do { | 1211 | do { |
1176 | int nid = next_nid; | 1212 | int nid = next_nid; |
1177 | if (delta < 0) { | 1213 | if (delta < 0) { |
1178 | next_nid = hstate_next_node_to_alloc(h); | ||
1179 | /* | 1214 | /* |
1180 | * To shrink on this node, there must be a surplus page | 1215 | * To shrink on this node, there must be a surplus page |
1181 | */ | 1216 | */ |
1182 | if (!h->surplus_huge_pages_node[nid]) | 1217 | if (!h->surplus_huge_pages_node[nid]) { |
1218 | next_nid = hstate_next_node_to_alloc(h, | ||
1219 | nodes_allowed); | ||
1183 | continue; | 1220 | continue; |
1221 | } | ||
1184 | } | 1222 | } |
1185 | if (delta > 0) { | 1223 | if (delta > 0) { |
1186 | next_nid = hstate_next_node_to_free(h); | ||
1187 | /* | 1224 | /* |
1188 | * Surplus cannot exceed the total number of pages | 1225 | * Surplus cannot exceed the total number of pages |
1189 | */ | 1226 | */ |
1190 | if (h->surplus_huge_pages_node[nid] >= | 1227 | if (h->surplus_huge_pages_node[nid] >= |
1191 | h->nr_huge_pages_node[nid]) | 1228 | h->nr_huge_pages_node[nid]) { |
1229 | next_nid = hstate_next_node_to_free(h, | ||
1230 | nodes_allowed); | ||
1192 | continue; | 1231 | continue; |
1232 | } | ||
1193 | } | 1233 | } |
1194 | 1234 | ||
1195 | h->surplus_huge_pages += delta; | 1235 | h->surplus_huge_pages += delta; |
@@ -1202,7 +1242,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
1202 | } | 1242 | } |
1203 | 1243 | ||
1204 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) | 1244 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
1205 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | 1245 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, |
1246 | nodemask_t *nodes_allowed) | ||
1206 | { | 1247 | { |
1207 | unsigned long min_count, ret; | 1248 | unsigned long min_count, ret; |
1208 | 1249 | ||
@@ -1222,7 +1263,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1222 | */ | 1263 | */ |
1223 | spin_lock(&hugetlb_lock); | 1264 | spin_lock(&hugetlb_lock); |
1224 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { | 1265 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { |
1225 | if (!adjust_pool_surplus(h, -1)) | 1266 | if (!adjust_pool_surplus(h, nodes_allowed, -1)) |
1226 | break; | 1267 | break; |
1227 | } | 1268 | } |
1228 | 1269 | ||
@@ -1233,11 +1274,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1233 | * and reducing the surplus. | 1274 | * and reducing the surplus. |
1234 | */ | 1275 | */ |
1235 | spin_unlock(&hugetlb_lock); | 1276 | spin_unlock(&hugetlb_lock); |
1236 | ret = alloc_fresh_huge_page(h); | 1277 | ret = alloc_fresh_huge_page(h, nodes_allowed); |
1237 | spin_lock(&hugetlb_lock); | 1278 | spin_lock(&hugetlb_lock); |
1238 | if (!ret) | 1279 | if (!ret) |
1239 | goto out; | 1280 | goto out; |
1240 | 1281 | ||
1282 | /* Bail for signals. Probably ctrl-c from user */ | ||
1283 | if (signal_pending(current)) | ||
1284 | goto out; | ||
1241 | } | 1285 | } |
1242 | 1286 | ||
1243 | /* | 1287 | /* |
@@ -1257,13 +1301,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1257 | */ | 1301 | */ |
1258 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; | 1302 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; |
1259 | min_count = max(count, min_count); | 1303 | min_count = max(count, min_count); |
1260 | try_to_free_low(h, min_count); | 1304 | try_to_free_low(h, min_count, nodes_allowed); |
1261 | while (min_count < persistent_huge_pages(h)) { | 1305 | while (min_count < persistent_huge_pages(h)) { |
1262 | if (!free_pool_huge_page(h, 0)) | 1306 | if (!free_pool_huge_page(h, nodes_allowed, 0)) |
1263 | break; | 1307 | break; |
1264 | } | 1308 | } |
1265 | while (count < persistent_huge_pages(h)) { | 1309 | while (count < persistent_huge_pages(h)) { |
1266 | if (!adjust_pool_surplus(h, 1)) | 1310 | if (!adjust_pool_surplus(h, nodes_allowed, 1)) |
1267 | break; | 1311 | break; |
1268 | } | 1312 | } |
1269 | out: | 1313 | out: |
@@ -1282,43 +1326,117 @@ out:
1282 | static struct kobject *hugepages_kobj; | 1326 | static struct kobject *hugepages_kobj; |
1283 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | 1327 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; |
1284 | 1328 | ||
1285 | static struct hstate *kobj_to_hstate(struct kobject *kobj) | 1329 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); |
1330 | |||
1331 | static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) | ||
1286 | { | 1332 | { |
1287 | int i; | 1333 | int i; |
1334 | |||
1288 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | 1335 | for (i = 0; i < HUGE_MAX_HSTATE; i++) |
1289 | if (hstate_kobjs[i] == kobj) | 1336 | if (hstate_kobjs[i] == kobj) { |
1337 | if (nidp) | ||
1338 | *nidp = NUMA_NO_NODE; | ||
1290 | return &hstates[i]; | 1339 | return &hstates[i]; |
1291 | BUG(); | 1340 | } |
1292 | return NULL; | 1341 | |
1342 | return kobj_to_node_hstate(kobj, nidp); | ||
1293 | } | 1343 | } |
1294 | 1344 | ||
1295 | static ssize_t nr_hugepages_show(struct kobject *kobj, | 1345 | static ssize_t nr_hugepages_show_common(struct kobject *kobj, |
1296 | struct kobj_attribute *attr, char *buf) | 1346 | struct kobj_attribute *attr, char *buf) |
1297 | { | 1347 | { |
1298 | struct hstate *h = kobj_to_hstate(kobj); | 1348 | struct hstate *h; |
1299 | return sprintf(buf, "%lu\n", h->nr_huge_pages); | 1349 | unsigned long nr_huge_pages; |
1350 | int nid; | ||
1351 | |||
1352 | h = kobj_to_hstate(kobj, &nid); | ||
1353 | if (nid == NUMA_NO_NODE) | ||
1354 | nr_huge_pages = h->nr_huge_pages; | ||
1355 | else | ||
1356 | nr_huge_pages = h->nr_huge_pages_node[nid]; | ||
1357 | |||
1358 | return sprintf(buf, "%lu\n", nr_huge_pages); | ||
1300 | } | 1359 | } |
1301 | static ssize_t nr_hugepages_store(struct kobject *kobj, | 1360 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, |
1302 | struct kobj_attribute *attr, const char *buf, size_t count) | 1361 | struct kobject *kobj, struct kobj_attribute *attr, |
1362 | const char *buf, size_t len) | ||
1303 | { | 1363 | { |
1304 | int err; | 1364 | int err; |
1305 | unsigned long input; | 1365 | int nid; |
1306 | struct hstate *h = kobj_to_hstate(kobj); | 1366 | unsigned long count; |
1367 | struct hstate *h; | ||
1368 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); | ||
1307 | 1369 | ||
1308 | err = strict_strtoul(buf, 10, &input); | 1370 | err = strict_strtoul(buf, 10, &count); |
1309 | if (err) | 1371 | if (err) |
1310 | return 0; | 1372 | return 0; |
1311 | 1373 | ||
1312 | h->max_huge_pages = set_max_huge_pages(h, input); | 1374 | h = kobj_to_hstate(kobj, &nid); |
1375 | if (nid == NUMA_NO_NODE) { | ||
1376 | /* | ||
1377 | * global hstate attribute | ||
1378 | */ | ||
1379 | if (!(obey_mempolicy && | ||
1380 | init_nodemask_of_mempolicy(nodes_allowed))) { | ||
1381 | NODEMASK_FREE(nodes_allowed); | ||
1382 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
1383 | } | ||
1384 | } else if (nodes_allowed) { | ||
1385 | /* | ||
1386 | * per node hstate attribute: adjust count to global, | ||
1387 | * but restrict alloc/free to the specified node. | ||
1388 | */ | ||
1389 | count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; | ||
1390 | init_nodemask_of_node(nodes_allowed, nid); | ||
1391 | } else | ||
1392 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
1313 | 1393 | ||
1314 | return count; | 1394 | h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); |
1395 | |||
1396 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | ||
1397 | NODEMASK_FREE(nodes_allowed); | ||
1398 | |||
1399 | return len; | ||
1400 | } | ||
1401 | |||
1402 | static ssize_t nr_hugepages_show(struct kobject *kobj, | ||
1403 | struct kobj_attribute *attr, char *buf) | ||
1404 | { | ||
1405 | return nr_hugepages_show_common(kobj, attr, buf); | ||
1406 | } | ||
1407 | |||
1408 | static ssize_t nr_hugepages_store(struct kobject *kobj, | ||
1409 | struct kobj_attribute *attr, const char *buf, size_t len) | ||
1410 | { | ||
1411 | return nr_hugepages_store_common(false, kobj, attr, buf, len); | ||
1315 | } | 1412 | } |
1316 | HSTATE_ATTR(nr_hugepages); | 1413 | HSTATE_ATTR(nr_hugepages); |
1317 | 1414 | ||
1415 | #ifdef CONFIG_NUMA | ||
1416 | |||
1417 | /* | ||
1418 | * hstate attribute for optionally mempolicy-based constraint on persistent | ||
1419 | * huge page alloc/free. | ||
1420 | */ | ||
1421 | static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, | ||
1422 | struct kobj_attribute *attr, char *buf) | ||
1423 | { | ||
1424 | return nr_hugepages_show_common(kobj, attr, buf); | ||
1425 | } | ||
1426 | |||
1427 | static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, | ||
1428 | struct kobj_attribute *attr, const char *buf, size_t len) | ||
1429 | { | ||
1430 | return nr_hugepages_store_common(true, kobj, attr, buf, len); | ||
1431 | } | ||
1432 | HSTATE_ATTR(nr_hugepages_mempolicy); | ||
1433 | #endif | ||
1434 | |||
1435 | |||
1318 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, | 1436 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, |
1319 | struct kobj_attribute *attr, char *buf) | 1437 | struct kobj_attribute *attr, char *buf) |
1320 | { | 1438 | { |
1321 | struct hstate *h = kobj_to_hstate(kobj); | 1439 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1322 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); | 1440 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); |
1323 | } | 1441 | } |
1324 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | 1442 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, |
@@ -1326,7 +1444,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1326 | { | 1444 | { |
1327 | int err; | 1445 | int err; |
1328 | unsigned long input; | 1446 | unsigned long input; |
1329 | struct hstate *h = kobj_to_hstate(kobj); | 1447 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1330 | 1448 | ||
1331 | err = strict_strtoul(buf, 10, &input); | 1449 | err = strict_strtoul(buf, 10, &input); |
1332 | if (err) | 1450 | if (err) |
@@ -1343,15 +1461,24 @@ HSTATE_ATTR(nr_overcommit_hugepages);
1343 | static ssize_t free_hugepages_show(struct kobject *kobj, | 1461 | static ssize_t free_hugepages_show(struct kobject *kobj, |
1344 | struct kobj_attribute *attr, char *buf) | 1462 | struct kobj_attribute *attr, char *buf) |
1345 | { | 1463 | { |
1346 | struct hstate *h = kobj_to_hstate(kobj); | 1464 | struct hstate *h; |
1347 | return sprintf(buf, "%lu\n", h->free_huge_pages); | 1465 | unsigned long free_huge_pages; |
1466 | int nid; | ||
1467 | |||
1468 | h = kobj_to_hstate(kobj, &nid); | ||
1469 | if (nid == NUMA_NO_NODE) | ||
1470 | free_huge_pages = h->free_huge_pages; | ||
1471 | else | ||
1472 | free_huge_pages = h->free_huge_pages_node[nid]; | ||
1473 | |||
1474 | return sprintf(buf, "%lu\n", free_huge_pages); | ||
1348 | } | 1475 | } |
1349 | HSTATE_ATTR_RO(free_hugepages); | 1476 | HSTATE_ATTR_RO(free_hugepages); |
1350 | 1477 | ||
1351 | static ssize_t resv_hugepages_show(struct kobject *kobj, | 1478 | static ssize_t resv_hugepages_show(struct kobject *kobj, |
1352 | struct kobj_attribute *attr, char *buf) | 1479 | struct kobj_attribute *attr, char *buf) |
1353 | { | 1480 | { |
1354 | struct hstate *h = kobj_to_hstate(kobj); | 1481 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1355 | return sprintf(buf, "%lu\n", h->resv_huge_pages); | 1482 | return sprintf(buf, "%lu\n", h->resv_huge_pages); |
1356 | } | 1483 | } |
1357 | HSTATE_ATTR_RO(resv_hugepages); | 1484 | HSTATE_ATTR_RO(resv_hugepages); |
@@ -1359,8 +1486,17 @@ HSTATE_ATTR_RO(resv_hugepages);
1359 | static ssize_t surplus_hugepages_show(struct kobject *kobj, | 1486 | static ssize_t surplus_hugepages_show(struct kobject *kobj, |
1360 | struct kobj_attribute *attr, char *buf) | 1487 | struct kobj_attribute *attr, char *buf) |
1361 | { | 1488 | { |
1362 | struct hstate *h = kobj_to_hstate(kobj); | 1489 | struct hstate *h; |
1363 | return sprintf(buf, "%lu\n", h->surplus_huge_pages); | 1490 | unsigned long surplus_huge_pages; |
1491 | int nid; | ||
1492 | |||
1493 | h = kobj_to_hstate(kobj, &nid); | ||
1494 | if (nid == NUMA_NO_NODE) | ||
1495 | surplus_huge_pages = h->surplus_huge_pages; | ||
1496 | else | ||
1497 | surplus_huge_pages = h->surplus_huge_pages_node[nid]; | ||
1498 | |||
1499 | return sprintf(buf, "%lu\n", surplus_huge_pages); | ||
1364 | } | 1500 | } |
1365 | HSTATE_ATTR_RO(surplus_hugepages); | 1501 | HSTATE_ATTR_RO(surplus_hugepages); |
1366 | 1502 | ||
@@ -1370,6 +1506,9 @@ static struct attribute *hstate_attrs[] = {
1370 | &free_hugepages_attr.attr, | 1506 | &free_hugepages_attr.attr, |
1371 | &resv_hugepages_attr.attr, | 1507 | &resv_hugepages_attr.attr, |
1372 | &surplus_hugepages_attr.attr, | 1508 | &surplus_hugepages_attr.attr, |
1509 | #ifdef CONFIG_NUMA | ||
1510 | &nr_hugepages_mempolicy_attr.attr, | ||
1511 | #endif | ||
1373 | NULL, | 1512 | NULL, |
1374 | }; | 1513 | }; |
1375 | 1514 | ||
@@ -1377,19 +1516,20 @@ static struct attribute_group hstate_attr_group = {
1377 | .attrs = hstate_attrs, | 1516 | .attrs = hstate_attrs, |
1378 | }; | 1517 | }; |
1379 | 1518 | ||
1380 | static int __init hugetlb_sysfs_add_hstate(struct hstate *h) | 1519 | static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, |
1520 | struct kobject **hstate_kobjs, | ||
1521 | struct attribute_group *hstate_attr_group) | ||
1381 | { | 1522 | { |
1382 | int retval; | 1523 | int retval; |
1524 | int hi = h - hstates; | ||
1383 | 1525 | ||
1384 | hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, | 1526 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); |
1385 | hugepages_kobj); | 1527 | if (!hstate_kobjs[hi]) |
1386 | if (!hstate_kobjs[h - hstates]) | ||
1387 | return -ENOMEM; | 1528 | return -ENOMEM; |
1388 | 1529 | ||
1389 | retval = sysfs_create_group(hstate_kobjs[h - hstates], | 1530 | retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); |
1390 | &hstate_attr_group); | ||
1391 | if (retval) | 1531 | if (retval) |
1392 | kobject_put(hstate_kobjs[h - hstates]); | 1532 | kobject_put(hstate_kobjs[hi]); |
1393 | 1533 | ||
1394 | return retval; | 1534 | return retval; |
1395 | } | 1535 | } |
@@ -1404,17 +1544,184 @@ static void __init hugetlb_sysfs_init(void)
1404 | return; | 1544 | return; |
1405 | 1545 | ||
1406 | for_each_hstate(h) { | 1546 | for_each_hstate(h) { |
1407 | err = hugetlb_sysfs_add_hstate(h); | 1547 | err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, |
1548 | hstate_kobjs, &hstate_attr_group); | ||
1408 | if (err) | 1549 | if (err) |
1409 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", | 1550 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", |
1410 | h->name); | 1551 | h->name); |
1411 | } | 1552 | } |
1412 | } | 1553 | } |
1413 | 1554 | ||
1555 | #ifdef CONFIG_NUMA | ||
1556 | |||
1557 | /* | ||
1558 | * node_hstate/s - associate per node hstate attributes, via their kobjects, | ||
1559 | * with node sysdevs in node_devices[] using a parallel array. The array | ||
1560 | * index of a node sysdev or _hstate == node id. | ||
1561 | * This is here to avoid any static dependency of the node sysdev driver, in | ||
1562 | * the base kernel, on the hugetlb module. | ||
1563 | */ | ||
1564 | struct node_hstate { | ||
1565 | struct kobject *hugepages_kobj; | ||
1566 | struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | ||
1567 | }; | ||
1568 | struct node_hstate node_hstates[MAX_NUMNODES]; | ||
1569 | |||
1570 | /* | ||
1571 | * A subset of global hstate attributes for node sysdevs | ||
1572 | */ | ||
1573 | static struct attribute *per_node_hstate_attrs[] = { | ||
1574 | &nr_hugepages_attr.attr, | ||
1575 | &free_hugepages_attr.attr, | ||
1576 | &surplus_hugepages_attr.attr, | ||
1577 | NULL, | ||
1578 | }; | ||
1579 | |||
1580 | static struct attribute_group per_node_hstate_attr_group = { | ||
1581 | .attrs = per_node_hstate_attrs, | ||
1582 | }; | ||
1583 | |||
1584 | /* | ||
1585 | * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj. | ||
1586 | * Returns node id via non-NULL nidp. | ||
1587 | */ | ||
1588 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) | ||
1589 | { | ||
1590 | int nid; | ||
1591 | |||
1592 | for (nid = 0; nid < nr_node_ids; nid++) { | ||
1593 | struct node_hstate *nhs = &node_hstates[nid]; | ||
1594 | int i; | ||
1595 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | ||
1596 | if (nhs->hstate_kobjs[i] == kobj) { | ||
1597 | if (nidp) | ||
1598 | *nidp = nid; | ||
1599 | return &hstates[i]; | ||
1600 | } | ||
1601 | } | ||
1602 | |||
1603 | BUG(); | ||
1604 | return NULL; | ||
1605 | } | ||
1606 | |||
1607 | /* | ||
1608 | * Unregister hstate attributes from a single node sysdev. | ||
1609 | * No-op if no hstate attributes attached. | ||
1610 | */ | ||
1611 | void hugetlb_unregister_node(struct node *node) | ||
1612 | { | ||
1613 | struct hstate *h; | ||
1614 | struct node_hstate *nhs = &node_hstates[node->sysdev.id]; | ||
1615 | |||
1616 | if (!nhs->hugepages_kobj) | ||
1617 | return; /* no hstate attributes */ | ||
1618 | |||
1619 | for_each_hstate(h) | ||
1620 | if (nhs->hstate_kobjs[h - hstates]) { | ||
1621 | kobject_put(nhs->hstate_kobjs[h - hstates]); | ||
1622 | nhs->hstate_kobjs[h - hstates] = NULL; | ||
1623 | } | ||
1624 | |||
1625 | kobject_put(nhs->hugepages_kobj); | ||
1626 | nhs->hugepages_kobj = NULL; | ||
1627 | } | ||
1628 | |||
1629 | /* | ||
1630 | * hugetlb module exit: unregister hstate attributes from node sysdevs | ||
1631 | * that have them. | ||
1632 | */ | ||
1633 | static void hugetlb_unregister_all_nodes(void) | ||
1634 | { | ||
1635 | int nid; | ||
1636 | |||
1637 | /* | ||
1638 | * disable node sysdev registrations. | ||
1639 | */ | ||
1640 | register_hugetlbfs_with_node(NULL, NULL); | ||
1641 | |||
1642 | /* | ||
1643 | * remove hstate attributes from any nodes that have them. | ||
1644 | */ | ||
1645 | for (nid = 0; nid < nr_node_ids; nid++) | ||
1646 | hugetlb_unregister_node(&node_devices[nid]); | ||
1647 | } | ||
1648 | |||
1649 | /* | ||
1650 | * Register hstate attributes for a single node sysdev. | ||
1651 | * No-op if attributes already registered. | ||
1652 | */ | ||
1653 | void hugetlb_register_node(struct node *node) | ||
1654 | { | ||
1655 | struct hstate *h; | ||
1656 | struct node_hstate *nhs = &node_hstates[node->sysdev.id]; | ||
1657 | int err; | ||
1658 | |||
1659 | if (nhs->hugepages_kobj) | ||
1660 | return; /* already allocated */ | ||
1661 | |||
1662 | nhs->hugepages_kobj = kobject_create_and_add("hugepages", | ||
1663 | &node->sysdev.kobj); | ||
1664 | if (!nhs->hugepages_kobj) | ||
1665 | return; | ||
1666 | |||
1667 | for_each_hstate(h) { | ||
1668 | err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, | ||
1669 | nhs->hstate_kobjs, | ||
1670 | &per_node_hstate_attr_group); | ||
1671 | if (err) { | ||
1672 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s" | ||
1673 | " for node %d\n", | ||
1674 | h->name, node->sysdev.id); | ||
1675 | hugetlb_unregister_node(node); | ||
1676 | break; | ||
1677 | } | ||
1678 | } | ||
1679 | } | ||
1680 | |||
1681 | /* | ||
1682 | * hugetlb init time: register hstate attributes for all registered node | ||
1683 | * sysdevs of nodes that have memory. All on-line nodes should have | ||
1684 | * registered their associated sysdev by this time. | ||
1685 | */ | ||
1686 | static void hugetlb_register_all_nodes(void) | ||
1687 | { | ||
1688 | int nid; | ||
1689 | |||
1690 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
1691 | struct node *node = &node_devices[nid]; | ||
1692 | if (node->sysdev.id == nid) | ||
1693 | hugetlb_register_node(node); | ||
1694 | } | ||
1695 | |||
1696 | /* | ||
1697 | * Let the node sysdev driver know we're here so it can | ||
1698 | * [un]register hstate attributes on node hotplug. | ||
1699 | */ | ||
1700 | register_hugetlbfs_with_node(hugetlb_register_node, | ||
1701 | hugetlb_unregister_node); | ||
1702 | } | ||
1703 | #else /* !CONFIG_NUMA */ | ||
1704 | |||
1705 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) | ||
1706 | { | ||
1707 | BUG(); | ||
1708 | if (nidp) | ||
1709 | *nidp = -1; | ||
1710 | return NULL; | ||
1711 | } | ||
1712 | |||
1713 | static void hugetlb_unregister_all_nodes(void) { } | ||
1714 | |||
1715 | static void hugetlb_register_all_nodes(void) { } | ||
1716 | |||
1717 | #endif | ||
1718 | |||
1414 | static void __exit hugetlb_exit(void) | 1719 | static void __exit hugetlb_exit(void) |
1415 | { | 1720 | { |
1416 | struct hstate *h; | 1721 | struct hstate *h; |
1417 | 1722 | ||
1723 | hugetlb_unregister_all_nodes(); | ||
1724 | |||
1418 | for_each_hstate(h) { | 1725 | for_each_hstate(h) { |
1419 | kobject_put(hstate_kobjs[h - hstates]); | 1726 | kobject_put(hstate_kobjs[h - hstates]); |
1420 | } | 1727 | } |
@@ -1449,6 +1756,8 @@ static int __init hugetlb_init(void)
1449 | 1756 | ||
1450 | hugetlb_sysfs_init(); | 1757 | hugetlb_sysfs_init(); |
1451 | 1758 | ||
1759 | hugetlb_register_all_nodes(); | ||
1760 | |||
1452 | return 0; | 1761 | return 0; |
1453 | } | 1762 | } |
1454 | module_init(hugetlb_init); | 1763 | module_init(hugetlb_init); |
@@ -1472,8 +1781,8 @@ void __init hugetlb_add_hstate(unsigned order)
1472 | h->free_huge_pages = 0; | 1781 | h->free_huge_pages = 0; |
1473 | for (i = 0; i < MAX_NUMNODES; ++i) | 1782 | for (i = 0; i < MAX_NUMNODES; ++i) |
1474 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1783 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
1475 | h->next_nid_to_alloc = first_node(node_online_map); | 1784 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); |
1476 | h->next_nid_to_free = first_node(node_online_map); | 1785 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); |
1477 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1786 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
1478 | huge_page_size(h)/1024); | 1787 | huge_page_size(h)/1024); |
1479 | 1788 | ||
@@ -1536,9 +1845,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
1536 | } | 1845 | } |
1537 | 1846 | ||
1538 | #ifdef CONFIG_SYSCTL | 1847 | #ifdef CONFIG_SYSCTL |
1539 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1848 | static int hugetlb_sysctl_handler_common(bool obey_mempolicy, |
1540 | void __user *buffer, | 1849 | struct ctl_table *table, int write, |
1541 | size_t *length, loff_t *ppos) | 1850 | void __user *buffer, size_t *length, loff_t *ppos) |
1542 | { | 1851 | { |
1543 | struct hstate *h = &default_hstate; | 1852 | struct hstate *h = &default_hstate; |
1544 | unsigned long tmp; | 1853 | unsigned long tmp; |
@@ -1550,12 +1859,40 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1550 | table->maxlen = sizeof(unsigned long); | 1859 | table->maxlen = sizeof(unsigned long); |
1551 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 1860 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
1552 | 1861 | ||
1553 | if (write) | 1862 | if (write) { |
1554 | h->max_huge_pages = set_max_huge_pages(h, tmp); | 1863 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, |
1864 | GFP_KERNEL | __GFP_NORETRY); | ||
1865 | if (!(obey_mempolicy && | ||
1866 | init_nodemask_of_mempolicy(nodes_allowed))) { | ||
1867 | NODEMASK_FREE(nodes_allowed); | ||
1868 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
1869 | } | ||
1870 | h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); | ||
1871 | |||
1872 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | ||
1873 | NODEMASK_FREE(nodes_allowed); | ||
1874 | } | ||
1555 | 1875 | ||
1556 | return 0; | 1876 | return 0; |
1557 | } | 1877 | } |
1558 | 1878 | ||
1879 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | ||
1880 | void __user *buffer, size_t *length, loff_t *ppos) | ||
1881 | { | ||
1882 | |||
1883 | return hugetlb_sysctl_handler_common(false, table, write, | ||
1884 | buffer, length, ppos); | ||
1885 | } | ||
1886 | |||
1887 | #ifdef CONFIG_NUMA | ||
1888 | int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, | ||
1889 | void __user *buffer, size_t *length, loff_t *ppos) | ||
1890 | { | ||
1891 | return hugetlb_sysctl_handler_common(true, table, write, | ||
1892 | buffer, length, ppos); | ||
1893 | } | ||
1894 | #endif /* CONFIG_NUMA */ | ||
1895 | |||
1559 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, | 1896 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, |
1560 | void __user *buffer, | 1897 | void __user *buffer, |
1561 | size_t *length, loff_t *ppos) | 1898 | size_t *length, loff_t *ppos) |
@@ -1751,7 +2088,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
1751 | 2088 | ||
1752 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); | 2089 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); |
1753 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { | 2090 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { |
1754 | update_mmu_cache(vma, address, entry); | 2091 | update_mmu_cache(vma, address, ptep); |
1755 | } | 2092 | } |
1756 | } | 2093 | } |
1757 | 2094 | ||
@@ -1903,6 +2240,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
1903 | + (vma->vm_pgoff >> PAGE_SHIFT); | 2240 | + (vma->vm_pgoff >> PAGE_SHIFT); |
1904 | mapping = (struct address_space *)page_private(page); | 2241 | mapping = (struct address_space *)page_private(page); |
1905 | 2242 | ||
2243 | /* | ||
2244 | * Take the mapping lock for the duration of the table walk. As | ||
2245 | * this mapping should be shared between all the VMAs, | ||
2246 | * __unmap_hugepage_range() is called as the lock is already held | ||
2247 | */ | ||
2248 | spin_lock(&mapping->i_mmap_lock); | ||
1906 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 2249 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
1907 | /* Do not unmap the current VMA */ | 2250 | /* Do not unmap the current VMA */ |
1908 | if (iter_vma == vma) | 2251 | if (iter_vma == vma) |
@@ -1916,10 +2259,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
1916 | * from the time of fork. This would look like data corruption | 2259 | * from the time of fork. This would look like data corruption |
1917 | */ | 2260 | */ |
1918 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) | 2261 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) |
1919 | unmap_hugepage_range(iter_vma, | 2262 | __unmap_hugepage_range(iter_vma, |
1920 | address, address + huge_page_size(h), | 2263 | address, address + huge_page_size(h), |
1921 | page); | 2264 | page); |
1922 | } | 2265 | } |
2266 | spin_unlock(&mapping->i_mmap_lock); | ||
1923 | 2267 | ||
1924 | return 1; | 2268 | return 1; |
1925 | } | 2269 | } |
@@ -1959,6 +2303,9 @@ retry_avoidcopy:
1959 | outside_reserve = 1; | 2303 | outside_reserve = 1; |
1960 | 2304 | ||
1961 | page_cache_get(old_page); | 2305 | page_cache_get(old_page); |
2306 | |||
2307 | /* Drop page_table_lock as buddy allocator may be called */ | ||
2308 | spin_unlock(&mm->page_table_lock); | ||
1962 | new_page = alloc_huge_page(vma, address, outside_reserve); | 2309 | new_page = alloc_huge_page(vma, address, outside_reserve); |
1963 | 2310 | ||
1964 | if (IS_ERR(new_page)) { | 2311 | if (IS_ERR(new_page)) { |
@@ -1976,19 +2323,25 @@ retry_avoidcopy:
1976 | if (unmap_ref_private(mm, vma, old_page, address)) { | 2323 | if (unmap_ref_private(mm, vma, old_page, address)) { |
1977 | BUG_ON(page_count(old_page) != 1); | 2324 | BUG_ON(page_count(old_page) != 1); |
1978 | BUG_ON(huge_pte_none(pte)); | 2325 | BUG_ON(huge_pte_none(pte)); |
2326 | spin_lock(&mm->page_table_lock); | ||
1979 | goto retry_avoidcopy; | 2327 | goto retry_avoidcopy; |
1980 | } | 2328 | } |
1981 | WARN_ON_ONCE(1); | 2329 | WARN_ON_ONCE(1); |
1982 | } | 2330 | } |
1983 | 2331 | ||
2332 | /* Caller expects lock to be held */ | ||
2333 | spin_lock(&mm->page_table_lock); | ||
1984 | return -PTR_ERR(new_page); | 2334 | return -PTR_ERR(new_page); |
1985 | } | 2335 | } |
1986 | 2336 | ||
1987 | spin_unlock(&mm->page_table_lock); | ||
1988 | copy_huge_page(new_page, old_page, address, vma); | 2337 | copy_huge_page(new_page, old_page, address, vma); |
1989 | __SetPageUptodate(new_page); | 2338 | __SetPageUptodate(new_page); |
1990 | spin_lock(&mm->page_table_lock); | ||
1991 | 2339 | ||
2340 | /* | ||
2341 | * Retake the page_table_lock to check for racing updates | ||
2342 | * before the page tables are altered | ||
2343 | */ | ||
2344 | spin_lock(&mm->page_table_lock); | ||
1992 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2345 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
1993 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 2346 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
1994 | /* Break COW */ | 2347 | /* Break COW */ |
@@ -2095,8 +2448,10 @@ retry:
2095 | spin_lock(&inode->i_lock); | 2448 | spin_lock(&inode->i_lock); |
2096 | inode->i_blocks += blocks_per_huge_page(h); | 2449 | inode->i_blocks += blocks_per_huge_page(h); |
2097 | spin_unlock(&inode->i_lock); | 2450 | spin_unlock(&inode->i_lock); |
2098 | } else | 2451 | } else { |
2099 | lock_page(page); | 2452 | lock_page(page); |
2453 | page->mapping = HUGETLB_POISON; | ||
2454 | } | ||
2100 | } | 2455 | } |
2101 | 2456 | ||
2102 | /* | 2457 | /* |
@@ -2206,7 +2561,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2206 | entry = pte_mkyoung(entry); | 2561 | entry = pte_mkyoung(entry); |
2207 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, | 2562 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, |
2208 | flags & FAULT_FLAG_WRITE)) | 2563 | flags & FAULT_FLAG_WRITE)) |
2209 | update_mmu_cache(vma, address, entry); | 2564 | update_mmu_cache(vma, address, ptep); |
2210 | 2565 | ||
2211 | out_page_table_lock: | 2566 | out_page_table_lock: |
2212 | spin_unlock(&mm->page_table_lock); | 2567 | spin_unlock(&mm->page_table_lock); |