Diffstat (limited to 'mm/hugetlb.c')
| -rw-r--r-- | mm/hugetlb.c | 551 |
1 files changed, 452 insertions, 99 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5d7601b02874..65f38c218207 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
| @@ -24,6 +24,7 @@ | |||
| 24 | #include <asm/io.h> | 24 | #include <asm/io.h> |
| 25 | 25 | ||
| 26 | #include <linux/hugetlb.h> | 26 | #include <linux/hugetlb.h> |
| 27 | #include <linux/node.h> | ||
| 27 | #include "internal.h" | 28 | #include "internal.h" |
| 28 | 29 | ||
| 29 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 30 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
| @@ -622,42 +623,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
| 622 | } | 623 | } |
| 623 | 624 | ||
| 624 | /* | 625 | /* |
| 625 | * Use a helper variable to find the next node and then | 626 | * common helper functions for hstate_next_node_to_{alloc|free}. |
| 626 | * copy it back to next_nid_to_alloc afterwards: | 627 | * We may have allocated or freed a huge page based on a different |
| 627 | * otherwise there's a window in which a racer might | 628 | * nodes_allowed previously, so h->next_node_to_{alloc|free} might |
| 628 | * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. | 629 | * be outside of *nodes_allowed. Ensure that we use an allowed |
| 629 | * But we don't need to use a spin_lock here: it really | 630 | * node for alloc or free. |
| 630 | * doesn't matter if occasionally a racer chooses the | ||
| 631 | * same nid as we do. Move nid forward in the mask even | ||
| 632 | * if we just successfully allocated a hugepage so that | ||
| 633 | * the next caller gets hugepages on the next node. | ||
| 634 | */ | 631 | */ |
| 635 | static int hstate_next_node_to_alloc(struct hstate *h) | 632 | static int next_node_allowed(int nid, nodemask_t *nodes_allowed) |
| 636 | { | 633 | { |
| 637 | int next_nid; | 634 | nid = next_node(nid, *nodes_allowed); |
| 638 | next_nid = next_node(h->next_nid_to_alloc, node_online_map); | 635 | if (nid == MAX_NUMNODES) |
| 639 | if (next_nid == MAX_NUMNODES) | 636 | nid = first_node(*nodes_allowed); |
| 640 | next_nid = first_node(node_online_map); | 637 | VM_BUG_ON(nid >= MAX_NUMNODES); |
| 641 | h->next_nid_to_alloc = next_nid; | 638 | |
| 642 | return next_nid; | 639 | return nid; |
| 640 | } | ||
| 641 | |||
| 642 | static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
| 643 | { | ||
| 644 | if (!node_isset(nid, *nodes_allowed)) | ||
| 645 | nid = next_node_allowed(nid, nodes_allowed); | ||
| 646 | return nid; | ||
| 647 | } | ||
| 648 | |||
| 649 | /* | ||
| 650 | * returns the previously saved node ["this node"] from which to | ||
| 651 | * allocate a persistent huge page for the pool and advance the | ||
| 652 | * next node from which to allocate, handling wrap at end of node | ||
| 653 | * mask. | ||
| 654 | */ | ||
| 655 | static int hstate_next_node_to_alloc(struct hstate *h, | ||
| 656 | nodemask_t *nodes_allowed) | ||
| 657 | { | ||
| 658 | int nid; | ||
| 659 | |||
| 660 | VM_BUG_ON(!nodes_allowed); | ||
| 661 | |||
| 662 | nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); | ||
| 663 | h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); | ||
| 664 | |||
| 665 | return nid; | ||
| 643 | } | 666 | } |
| 644 | 667 | ||
| 645 | static int alloc_fresh_huge_page(struct hstate *h) | 668 | static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) |
| 646 | { | 669 | { |
| 647 | struct page *page; | 670 | struct page *page; |
| 648 | int start_nid; | 671 | int start_nid; |
| 649 | int next_nid; | 672 | int next_nid; |
| 650 | int ret = 0; | 673 | int ret = 0; |
| 651 | 674 | ||
| 652 | start_nid = h->next_nid_to_alloc; | 675 | start_nid = hstate_next_node_to_alloc(h, nodes_allowed); |
| 653 | next_nid = start_nid; | 676 | next_nid = start_nid; |
| 654 | 677 | ||
| 655 | do { | 678 | do { |
| 656 | page = alloc_fresh_huge_page_node(h, next_nid); | 679 | page = alloc_fresh_huge_page_node(h, next_nid); |
| 657 | if (page) | 680 | if (page) { |
| 658 | ret = 1; | 681 | ret = 1; |
| 659 | next_nid = hstate_next_node_to_alloc(h); | 682 | break; |
| 660 | } while (!page && next_nid != start_nid); | 683 | } |
| 684 | next_nid = hstate_next_node_to_alloc(h, nodes_allowed); | ||
| 685 | } while (next_nid != start_nid); | ||
| 661 | 686 | ||
| 662 | if (ret) | 687 | if (ret) |
| 663 | count_vm_event(HTLB_BUDDY_PGALLOC); | 688 | count_vm_event(HTLB_BUDDY_PGALLOC); |
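The hunk above replaces the open-coded next_nid_to_alloc handling with two small helpers: next_node_allowed() advances through the caller-supplied nodes_allowed mask and wraps back to the first allowed node when next_node() runs off the end, while get_valid_node_allowed() re-validates a saved nid that may have been chosen against a different mask on an earlier call. A minimal userspace sketch of that wrap-around walk, with a plain bitmask standing in for nodemask_t and hypothetical helper names:

```c
#include <stdio.h>

#define MAX_NODES 8			/* stand-in for MAX_NUMNODES */

/* Hypothetical userspace stand-ins for next_node()/first_node(); the allowed
 * set is a plain bitmask rather than a nodemask_t. */
static int next_node_in_mask(int nid, unsigned int allowed)
{
	for (int i = nid + 1; i < MAX_NODES; i++)
		if (allowed & (1u << i))
			return i;
	return MAX_NODES;		/* mirrors next_node() returning MAX_NUMNODES */
}

static int first_node_in_mask(unsigned int allowed)
{
	return next_node_in_mask(-1, allowed);
}

/* Same shape as next_node_allowed() in this hunk: advance, wrap at the end. */
static int next_node_allowed(int nid, unsigned int allowed)
{
	nid = next_node_in_mask(nid, allowed);
	if (nid == MAX_NODES)
		nid = first_node_in_mask(allowed);
	return nid;
}

int main(void)
{
	unsigned int allowed = 0x2a;	/* nodes 1, 3 and 5 */
	int nid = 1;

	for (int i = 0; i < 6; i++) {
		printf("%d ", nid);	/* prints: 1 3 5 1 3 5 */
		nid = next_node_allowed(nid, allowed);
	}
	printf("\n");
	return 0;
}
```

With nodes 1, 3 and 5 allowed and a starting nid of 1, the walk cycles 1, 3, 5, 1, … — the round-robin interleave that keeps pool growth and shrinkage balanced across the allowed nodes.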
| @@ -668,17 +693,21 @@ static int alloc_fresh_huge_page(struct hstate *h) | |||
| 668 | } | 693 | } |
| 669 | 694 | ||
| 670 | /* | 695 | /* |
| 671 | * helper for free_pool_huge_page() - find next node | 696 | * helper for free_pool_huge_page() - return the previously saved |
| 672 | * from which to free a huge page | 697 | * node ["this node"] from which to free a huge page. Advance the |
| 698 | * next node id whether or not we find a free huge page to free so | ||
| 699 | * that the next attempt to free addresses the next node. | ||
| 673 | */ | 700 | */ |
| 674 | static int hstate_next_node_to_free(struct hstate *h) | 701 | static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) |
| 675 | { | 702 | { |
| 676 | int next_nid; | 703 | int nid; |
| 677 | next_nid = next_node(h->next_nid_to_free, node_online_map); | 704 | |
| 678 | if (next_nid == MAX_NUMNODES) | 705 | VM_BUG_ON(!nodes_allowed); |
| 679 | next_nid = first_node(node_online_map); | 706 | |
| 680 | h->next_nid_to_free = next_nid; | 707 | nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); |
| 681 | return next_nid; | 708 | h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); |
| 709 | |||
| 710 | return nid; | ||
| 682 | } | 711 | } |
| 683 | 712 | ||
| 684 | /* | 713 | /* |
| @@ -687,13 +716,14 @@ static int hstate_next_node_to_free(struct hstate *h) | |||
| 687 | * balanced over allowed nodes. | 716 | * balanced over allowed nodes. |
| 688 | * Called with hugetlb_lock locked. | 717 | * Called with hugetlb_lock locked. |
| 689 | */ | 718 | */ |
| 690 | static int free_pool_huge_page(struct hstate *h, bool acct_surplus) | 719 | static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, |
| 720 | bool acct_surplus) | ||
| 691 | { | 721 | { |
| 692 | int start_nid; | 722 | int start_nid; |
| 693 | int next_nid; | 723 | int next_nid; |
| 694 | int ret = 0; | 724 | int ret = 0; |
| 695 | 725 | ||
| 696 | start_nid = h->next_nid_to_free; | 726 | start_nid = hstate_next_node_to_free(h, nodes_allowed); |
| 697 | next_nid = start_nid; | 727 | next_nid = start_nid; |
| 698 | 728 | ||
| 699 | do { | 729 | do { |
| @@ -715,9 +745,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus) | |||
| 715 | } | 745 | } |
| 716 | update_and_free_page(h, page); | 746 | update_and_free_page(h, page); |
| 717 | ret = 1; | 747 | ret = 1; |
| 748 | break; | ||
| 718 | } | 749 | } |
| 719 | next_nid = hstate_next_node_to_free(h); | 750 | next_nid = hstate_next_node_to_free(h, nodes_allowed); |
| 720 | } while (!ret && next_nid != start_nid); | 751 | } while (next_nid != start_nid); |
| 721 | 752 | ||
| 722 | return ret; | 753 | return ret; |
| 723 | } | 754 | } |
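Both alloc_fresh_huge_page() and free_pool_huge_page() now share the same loop shape: fetch "this node" from the hstate_next_node_to_{alloc|free} helper, attempt the operation, break on the first success, and otherwise keep advancing until the walk wraps back to the starting node. A rough, runnable analogue of the free side (the per-node attempt and the modular advance are stand-ins, not kernel APIs; in the kernel the advance happens inside the helper itself):

```c
#include <stdbool.h>
#include <stdio.h>

#define NODES 4

/* Hypothetical per-node "free one page" attempt; succeeds only while the node
 * still has pages on its free list. */
static int free_count[NODES] = { 0, 2, 0, 1 };

static bool try_free_on_node(int nid)
{
	if (free_count[nid] == 0)
		return false;
	free_count[nid]--;
	return true;
}

int main(void)
{
	int next = 0;			/* stand-in for h->next_nid_to_free */
	int freed = 0;
	bool ok;

	do {
		int start = next;
		int nid = start;

		ok = false;
		do {
			if (try_free_on_node(nid)) {
				ok = true;	/* success: stop without retrying other nodes */
				break;
			}
			nid = (nid + 1) % NODES;
		} while (nid != start);

		next = (nid + 1) % NODES;	/* next caller starts on the following node */
		if (ok)
			freed++;
	} while (ok);

	printf("freed %d pages\n", freed);	/* prints: freed 3 pages */
	return 0;
}
```

Repeated calls keep rotating the starting node, so successive frees are spread across the nodes that still hold free pages — the behaviour return_unused_surplus_pages() relies on in the next hunk.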
| @@ -911,14 +942,14 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
| 911 | 942 | ||
| 912 | /* | 943 | /* |
| 913 | * We want to release as many surplus pages as possible, spread | 944 | * We want to release as many surplus pages as possible, spread |
| 914 | * evenly across all nodes. Iterate across all nodes until we | 945 | * evenly across all nodes with memory. Iterate across these nodes |
| 915 | * can no longer free unreserved surplus pages. This occurs when | 946 | * until we can no longer free unreserved surplus pages. This occurs |
| 916 | * the nodes with surplus pages have no free pages. | 947 | * when the nodes with surplus pages have no free pages. |
| 917 | * free_pool_huge_page() will balance the frees across the | 948 | * free_pool_huge_page() will balance the freed pages across the |
| 918 | * on-line nodes for us and will handle the hstate accounting. | 949 | * on-line nodes with memory and will handle the hstate accounting. |
| 919 | */ | 950 | */ |
| 920 | while (nr_pages--) { | 951 | while (nr_pages--) { |
| 921 | if (!free_pool_huge_page(h, 1)) | 952 | if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) |
| 922 | break; | 953 | break; |
| 923 | } | 954 | } |
| 924 | } | 955 | } |
| @@ -1022,16 +1053,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
| 1022 | int __weak alloc_bootmem_huge_page(struct hstate *h) | 1053 | int __weak alloc_bootmem_huge_page(struct hstate *h) |
| 1023 | { | 1054 | { |
| 1024 | struct huge_bootmem_page *m; | 1055 | struct huge_bootmem_page *m; |
| 1025 | int nr_nodes = nodes_weight(node_online_map); | 1056 | int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); |
| 1026 | 1057 | ||
| 1027 | while (nr_nodes) { | 1058 | while (nr_nodes) { |
| 1028 | void *addr; | 1059 | void *addr; |
| 1029 | 1060 | ||
| 1030 | addr = __alloc_bootmem_node_nopanic( | 1061 | addr = __alloc_bootmem_node_nopanic( |
| 1031 | NODE_DATA(h->next_nid_to_alloc), | 1062 | NODE_DATA(hstate_next_node_to_alloc(h, |
| 1063 | &node_states[N_HIGH_MEMORY])), | ||
| 1032 | huge_page_size(h), huge_page_size(h), 0); | 1064 | huge_page_size(h), huge_page_size(h), 0); |
| 1033 | 1065 | ||
| 1034 | hstate_next_node_to_alloc(h); | ||
| 1035 | if (addr) { | 1066 | if (addr) { |
| 1036 | /* | 1067 | /* |
| 1037 | * Use the beginning of the huge page to store the | 1068 | * Use the beginning of the huge page to store the |
| @@ -1084,7 +1115,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) | |||
| 1084 | if (h->order >= MAX_ORDER) { | 1115 | if (h->order >= MAX_ORDER) { |
| 1085 | if (!alloc_bootmem_huge_page(h)) | 1116 | if (!alloc_bootmem_huge_page(h)) |
| 1086 | break; | 1117 | break; |
| 1087 | } else if (!alloc_fresh_huge_page(h)) | 1118 | } else if (!alloc_fresh_huge_page(h, |
| 1119 | &node_states[N_HIGH_MEMORY])) | ||
| 1088 | break; | 1120 | break; |
| 1089 | } | 1121 | } |
| 1090 | h->max_huge_pages = i; | 1122 | h->max_huge_pages = i; |
| @@ -1126,14 +1158,15 @@ static void __init report_hugepages(void) | |||
| 1126 | } | 1158 | } |
| 1127 | 1159 | ||
| 1128 | #ifdef CONFIG_HIGHMEM | 1160 | #ifdef CONFIG_HIGHMEM |
| 1129 | static void try_to_free_low(struct hstate *h, unsigned long count) | 1161 | static void try_to_free_low(struct hstate *h, unsigned long count, |
| 1162 | nodemask_t *nodes_allowed) | ||
| 1130 | { | 1163 | { |
| 1131 | int i; | 1164 | int i; |
| 1132 | 1165 | ||
| 1133 | if (h->order >= MAX_ORDER) | 1166 | if (h->order >= MAX_ORDER) |
| 1134 | return; | 1167 | return; |
| 1135 | 1168 | ||
| 1136 | for (i = 0; i < MAX_NUMNODES; ++i) { | 1169 | for_each_node_mask(i, *nodes_allowed) { |
| 1137 | struct page *page, *next; | 1170 | struct page *page, *next; |
| 1138 | struct list_head *freel = &h->hugepage_freelists[i]; | 1171 | struct list_head *freel = &h->hugepage_freelists[i]; |
| 1139 | list_for_each_entry_safe(page, next, freel, lru) { | 1172 | list_for_each_entry_safe(page, next, freel, lru) { |
| @@ -1149,7 +1182,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count) | |||
| 1149 | } | 1182 | } |
| 1150 | } | 1183 | } |
| 1151 | #else | 1184 | #else |
| 1152 | static inline void try_to_free_low(struct hstate *h, unsigned long count) | 1185 | static inline void try_to_free_low(struct hstate *h, unsigned long count, |
| 1186 | nodemask_t *nodes_allowed) | ||
| 1153 | { | 1187 | { |
| 1154 | } | 1188 | } |
| 1155 | #endif | 1189 | #endif |
| @@ -1159,7 +1193,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) | |||
| 1159 | * balanced by operating on them in a round-robin fashion. | 1193 | * balanced by operating on them in a round-robin fashion. |
| 1160 | * Returns 1 if an adjustment was made. | 1194 | * Returns 1 if an adjustment was made. |
| 1161 | */ | 1195 | */ |
| 1162 | static int adjust_pool_surplus(struct hstate *h, int delta) | 1196 | static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, |
| 1197 | int delta) | ||
| 1163 | { | 1198 | { |
| 1164 | int start_nid, next_nid; | 1199 | int start_nid, next_nid; |
| 1165 | int ret = 0; | 1200 | int ret = 0; |
| @@ -1167,29 +1202,33 @@ static int adjust_pool_surplus(struct hstate *h, int delta) | |||
| 1167 | VM_BUG_ON(delta != -1 && delta != 1); | 1202 | VM_BUG_ON(delta != -1 && delta != 1); |
| 1168 | 1203 | ||
| 1169 | if (delta < 0) | 1204 | if (delta < 0) |
| 1170 | start_nid = h->next_nid_to_alloc; | 1205 | start_nid = hstate_next_node_to_alloc(h, nodes_allowed); |
| 1171 | else | 1206 | else |
| 1172 | start_nid = h->next_nid_to_free; | 1207 | start_nid = hstate_next_node_to_free(h, nodes_allowed); |
| 1173 | next_nid = start_nid; | 1208 | next_nid = start_nid; |
| 1174 | 1209 | ||
| 1175 | do { | 1210 | do { |
| 1176 | int nid = next_nid; | 1211 | int nid = next_nid; |
| 1177 | if (delta < 0) { | 1212 | if (delta < 0) { |
| 1178 | next_nid = hstate_next_node_to_alloc(h); | ||
| 1179 | /* | 1213 | /* |
| 1180 | * To shrink on this node, there must be a surplus page | 1214 | * To shrink on this node, there must be a surplus page |
| 1181 | */ | 1215 | */ |
| 1182 | if (!h->surplus_huge_pages_node[nid]) | 1216 | if (!h->surplus_huge_pages_node[nid]) { |
| 1217 | next_nid = hstate_next_node_to_alloc(h, | ||
| 1218 | nodes_allowed); | ||
| 1183 | continue; | 1219 | continue; |
| 1220 | } | ||
| 1184 | } | 1221 | } |
| 1185 | if (delta > 0) { | 1222 | if (delta > 0) { |
| 1186 | next_nid = hstate_next_node_to_free(h); | ||
| 1187 | /* | 1223 | /* |
| 1188 | * Surplus cannot exceed the total number of pages | 1224 | * Surplus cannot exceed the total number of pages |
| 1189 | */ | 1225 | */ |
| 1190 | if (h->surplus_huge_pages_node[nid] >= | 1226 | if (h->surplus_huge_pages_node[nid] >= |
| 1191 | h->nr_huge_pages_node[nid]) | 1227 | h->nr_huge_pages_node[nid]) { |
| 1228 | next_nid = hstate_next_node_to_free(h, | ||
| 1229 | nodes_allowed); | ||
| 1192 | continue; | 1230 | continue; |
| 1231 | } | ||
| 1193 | } | 1232 | } |
| 1194 | 1233 | ||
| 1195 | h->surplus_huge_pages += delta; | 1234 | h->surplus_huge_pages += delta; |
| @@ -1202,7 +1241,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta) | |||
| 1202 | } | 1241 | } |
| 1203 | 1242 | ||
| 1204 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) | 1243 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
| 1205 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | 1244 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, |
| 1245 | nodemask_t *nodes_allowed) | ||
| 1206 | { | 1246 | { |
| 1207 | unsigned long min_count, ret; | 1247 | unsigned long min_count, ret; |
| 1208 | 1248 | ||
| @@ -1222,7 +1262,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
| 1222 | */ | 1262 | */ |
| 1223 | spin_lock(&hugetlb_lock); | 1263 | spin_lock(&hugetlb_lock); |
| 1224 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { | 1264 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { |
| 1225 | if (!adjust_pool_surplus(h, -1)) | 1265 | if (!adjust_pool_surplus(h, nodes_allowed, -1)) |
| 1226 | break; | 1266 | break; |
| 1227 | } | 1267 | } |
| 1228 | 1268 | ||
| @@ -1233,11 +1273,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
| 1233 | * and reducing the surplus. | 1273 | * and reducing the surplus. |
| 1234 | */ | 1274 | */ |
| 1235 | spin_unlock(&hugetlb_lock); | 1275 | spin_unlock(&hugetlb_lock); |
| 1236 | ret = alloc_fresh_huge_page(h); | 1276 | ret = alloc_fresh_huge_page(h, nodes_allowed); |
| 1237 | spin_lock(&hugetlb_lock); | 1277 | spin_lock(&hugetlb_lock); |
| 1238 | if (!ret) | 1278 | if (!ret) |
| 1239 | goto out; | 1279 | goto out; |
| 1240 | 1280 | ||
| 1281 | /* Bail for signals. Probably ctrl-c from user */ | ||
| 1282 | if (signal_pending(current)) | ||
| 1283 | goto out; | ||
| 1241 | } | 1284 | } |
| 1242 | 1285 | ||
| 1243 | /* | 1286 | /* |
| @@ -1257,13 +1300,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
| 1257 | */ | 1300 | */ |
| 1258 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; | 1301 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; |
| 1259 | min_count = max(count, min_count); | 1302 | min_count = max(count, min_count); |
| 1260 | try_to_free_low(h, min_count); | 1303 | try_to_free_low(h, min_count, nodes_allowed); |
| 1261 | while (min_count < persistent_huge_pages(h)) { | 1304 | while (min_count < persistent_huge_pages(h)) { |
| 1262 | if (!free_pool_huge_page(h, 0)) | 1305 | if (!free_pool_huge_page(h, nodes_allowed, 0)) |
| 1263 | break; | 1306 | break; |
| 1264 | } | 1307 | } |
| 1265 | while (count < persistent_huge_pages(h)) { | 1308 | while (count < persistent_huge_pages(h)) { |
| 1266 | if (!adjust_pool_surplus(h, 1)) | 1309 | if (!adjust_pool_surplus(h, nodes_allowed, 1)) |
| 1267 | break; | 1310 | break; |
| 1268 | } | 1311 | } |
| 1269 | out: | 1312 | out: |
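Taken together, the set_max_huge_pages() hunks thread nodes_allowed through every helper and add one behavioural change: the grow loop now bails out if a signal is pending, so a large pool request can be interrupted from the shell. A very rough userspace sketch of the resulting grow path (all helpers are hypothetical stand-ins; locking is omitted):

```c
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins: a real allocation can fail, and a pending signal
 * now terminates the grow loop early. */
static bool alloc_fresh(void)	      { return true; }
static bool signal_pending_stub(void) { return false; }

int main(void)
{
	unsigned long persistent = 10, surplus = 3, count = 20;

	/* First absorb surplus pages into the persistent pool ... */
	while (surplus && count > persistent) {
		surplus--;		/* adjust_pool_surplus(h, nodes_allowed, -1) */
		persistent++;
	}

	/* ... then allocate fresh huge pages until the target is reached,
	 * bailing out on allocation failure or a pending signal. */
	while (count > persistent) {
		if (!alloc_fresh())
			break;
		persistent++;
		if (signal_pending_stub())
			break;
	}

	printf("persistent huge pages: %lu\n", persistent);	/* prints 20 */
	return 0;
}
```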
| @@ -1282,43 +1325,117 @@ out: | |||
| 1282 | static struct kobject *hugepages_kobj; | 1325 | static struct kobject *hugepages_kobj; |
| 1283 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | 1326 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; |
| 1284 | 1327 | ||
| 1285 | static struct hstate *kobj_to_hstate(struct kobject *kobj) | 1328 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); |
| 1329 | |||
| 1330 | static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) | ||
| 1286 | { | 1331 | { |
| 1287 | int i; | 1332 | int i; |
| 1333 | |||
| 1288 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | 1334 | for (i = 0; i < HUGE_MAX_HSTATE; i++) |
| 1289 | if (hstate_kobjs[i] == kobj) | 1335 | if (hstate_kobjs[i] == kobj) { |
| 1336 | if (nidp) | ||
| 1337 | *nidp = NUMA_NO_NODE; | ||
| 1290 | return &hstates[i]; | 1338 | return &hstates[i]; |
| 1291 | BUG(); | 1339 | } |
| 1292 | return NULL; | 1340 | |
| 1341 | return kobj_to_node_hstate(kobj, nidp); | ||
| 1293 | } | 1342 | } |
| 1294 | 1343 | ||
| 1295 | static ssize_t nr_hugepages_show(struct kobject *kobj, | 1344 | static ssize_t nr_hugepages_show_common(struct kobject *kobj, |
| 1296 | struct kobj_attribute *attr, char *buf) | 1345 | struct kobj_attribute *attr, char *buf) |
| 1297 | { | 1346 | { |
| 1298 | struct hstate *h = kobj_to_hstate(kobj); | 1347 | struct hstate *h; |
| 1299 | return sprintf(buf, "%lu\n", h->nr_huge_pages); | 1348 | unsigned long nr_huge_pages; |
| 1349 | int nid; | ||
| 1350 | |||
| 1351 | h = kobj_to_hstate(kobj, &nid); | ||
| 1352 | if (nid == NUMA_NO_NODE) | ||
| 1353 | nr_huge_pages = h->nr_huge_pages; | ||
| 1354 | else | ||
| 1355 | nr_huge_pages = h->nr_huge_pages_node[nid]; | ||
| 1356 | |||
| 1357 | return sprintf(buf, "%lu\n", nr_huge_pages); | ||
| 1300 | } | 1358 | } |
| 1301 | static ssize_t nr_hugepages_store(struct kobject *kobj, | 1359 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, |
| 1302 | struct kobj_attribute *attr, const char *buf, size_t count) | 1360 | struct kobject *kobj, struct kobj_attribute *attr, |
| 1361 | const char *buf, size_t len) | ||
| 1303 | { | 1362 | { |
| 1304 | int err; | 1363 | int err; |
| 1305 | unsigned long input; | 1364 | int nid; |
| 1306 | struct hstate *h = kobj_to_hstate(kobj); | 1365 | unsigned long count; |
| 1366 | struct hstate *h; | ||
| 1367 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); | ||
| 1307 | 1368 | ||
| 1308 | err = strict_strtoul(buf, 10, &input); | 1369 | err = strict_strtoul(buf, 10, &count); |
| 1309 | if (err) | 1370 | if (err) |
| 1310 | return 0; | 1371 | return 0; |
| 1311 | 1372 | ||
| 1312 | h->max_huge_pages = set_max_huge_pages(h, input); | 1373 | h = kobj_to_hstate(kobj, &nid); |
| 1374 | if (nid == NUMA_NO_NODE) { | ||
| 1375 | /* | ||
| 1376 | * global hstate attribute | ||
| 1377 | */ | ||
| 1378 | if (!(obey_mempolicy && | ||
| 1379 | init_nodemask_of_mempolicy(nodes_allowed))) { | ||
| 1380 | NODEMASK_FREE(nodes_allowed); | ||
| 1381 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
| 1382 | } | ||
| 1383 | } else if (nodes_allowed) { | ||
| 1384 | /* | ||
| 1385 | * per node hstate attribute: adjust count to global, | ||
| 1386 | * but restrict alloc/free to the specified node. | ||
| 1387 | */ | ||
| 1388 | count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; | ||
| 1389 | init_nodemask_of_node(nodes_allowed, nid); | ||
| 1390 | } else | ||
| 1391 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
| 1392 | |||
| 1393 | h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); | ||
| 1313 | 1394 | ||
| 1314 | return count; | 1395 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) |
| 1396 | NODEMASK_FREE(nodes_allowed); | ||
| 1397 | |||
| 1398 | return len; | ||
| 1399 | } | ||
| 1400 | |||
| 1401 | static ssize_t nr_hugepages_show(struct kobject *kobj, | ||
| 1402 | struct kobj_attribute *attr, char *buf) | ||
| 1403 | { | ||
| 1404 | return nr_hugepages_show_common(kobj, attr, buf); | ||
| 1405 | } | ||
| 1406 | |||
| 1407 | static ssize_t nr_hugepages_store(struct kobject *kobj, | ||
| 1408 | struct kobj_attribute *attr, const char *buf, size_t len) | ||
| 1409 | { | ||
| 1410 | return nr_hugepages_store_common(false, kobj, attr, buf, len); | ||
| 1315 | } | 1411 | } |
| 1316 | HSTATE_ATTR(nr_hugepages); | 1412 | HSTATE_ATTR(nr_hugepages); |
| 1317 | 1413 | ||
| 1414 | #ifdef CONFIG_NUMA | ||
| 1415 | |||
| 1416 | /* | ||
| 1417 | * hstate attribute for optionally mempolicy-based constraint on persistent | ||
| 1418 | * huge page alloc/free. | ||
| 1419 | */ | ||
| 1420 | static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, | ||
| 1421 | struct kobj_attribute *attr, char *buf) | ||
| 1422 | { | ||
| 1423 | return nr_hugepages_show_common(kobj, attr, buf); | ||
| 1424 | } | ||
| 1425 | |||
| 1426 | static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, | ||
| 1427 | struct kobj_attribute *attr, const char *buf, size_t len) | ||
| 1428 | { | ||
| 1429 | return nr_hugepages_store_common(true, kobj, attr, buf, len); | ||
| 1430 | } | ||
| 1431 | HSTATE_ATTR(nr_hugepages_mempolicy); | ||
| 1432 | #endif | ||
| 1433 | |||
| 1434 | |||
| 1318 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, | 1435 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, |
| 1319 | struct kobj_attribute *attr, char *buf) | 1436 | struct kobj_attribute *attr, char *buf) |
| 1320 | { | 1437 | { |
| 1321 | struct hstate *h = kobj_to_hstate(kobj); | 1438 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
| 1322 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); | 1439 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); |
| 1323 | } | 1440 | } |
| 1324 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | 1441 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, |
| @@ -1326,7 +1443,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | |||
| 1326 | { | 1443 | { |
| 1327 | int err; | 1444 | int err; |
| 1328 | unsigned long input; | 1445 | unsigned long input; |
| 1329 | struct hstate *h = kobj_to_hstate(kobj); | 1446 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
| 1330 | 1447 | ||
| 1331 | err = strict_strtoul(buf, 10, &input); | 1448 | err = strict_strtoul(buf, 10, &input); |
| 1332 | if (err) | 1449 | if (err) |
| @@ -1343,15 +1460,24 @@ HSTATE_ATTR(nr_overcommit_hugepages); | |||
| 1343 | static ssize_t free_hugepages_show(struct kobject *kobj, | 1460 | static ssize_t free_hugepages_show(struct kobject *kobj, |
| 1344 | struct kobj_attribute *attr, char *buf) | 1461 | struct kobj_attribute *attr, char *buf) |
| 1345 | { | 1462 | { |
| 1346 | struct hstate *h = kobj_to_hstate(kobj); | 1463 | struct hstate *h; |
| 1347 | return sprintf(buf, "%lu\n", h->free_huge_pages); | 1464 | unsigned long free_huge_pages; |
| 1465 | int nid; | ||
| 1466 | |||
| 1467 | h = kobj_to_hstate(kobj, &nid); | ||
| 1468 | if (nid == NUMA_NO_NODE) | ||
| 1469 | free_huge_pages = h->free_huge_pages; | ||
| 1470 | else | ||
| 1471 | free_huge_pages = h->free_huge_pages_node[nid]; | ||
| 1472 | |||
| 1473 | return sprintf(buf, "%lu\n", free_huge_pages); | ||
| 1348 | } | 1474 | } |
| 1349 | HSTATE_ATTR_RO(free_hugepages); | 1475 | HSTATE_ATTR_RO(free_hugepages); |
| 1350 | 1476 | ||
| 1351 | static ssize_t resv_hugepages_show(struct kobject *kobj, | 1477 | static ssize_t resv_hugepages_show(struct kobject *kobj, |
| 1352 | struct kobj_attribute *attr, char *buf) | 1478 | struct kobj_attribute *attr, char *buf) |
| 1353 | { | 1479 | { |
| 1354 | struct hstate *h = kobj_to_hstate(kobj); | 1480 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
| 1355 | return sprintf(buf, "%lu\n", h->resv_huge_pages); | 1481 | return sprintf(buf, "%lu\n", h->resv_huge_pages); |
| 1356 | } | 1482 | } |
| 1357 | HSTATE_ATTR_RO(resv_hugepages); | 1483 | HSTATE_ATTR_RO(resv_hugepages); |
| @@ -1359,8 +1485,17 @@ HSTATE_ATTR_RO(resv_hugepages); | |||
| 1359 | static ssize_t surplus_hugepages_show(struct kobject *kobj, | 1485 | static ssize_t surplus_hugepages_show(struct kobject *kobj, |
| 1360 | struct kobj_attribute *attr, char *buf) | 1486 | struct kobj_attribute *attr, char *buf) |
| 1361 | { | 1487 | { |
| 1362 | struct hstate *h = kobj_to_hstate(kobj); | 1488 | struct hstate *h; |
| 1363 | return sprintf(buf, "%lu\n", h->surplus_huge_pages); | 1489 | unsigned long surplus_huge_pages; |
| 1490 | int nid; | ||
| 1491 | |||
| 1492 | h = kobj_to_hstate(kobj, &nid); | ||
| 1493 | if (nid == NUMA_NO_NODE) | ||
| 1494 | surplus_huge_pages = h->surplus_huge_pages; | ||
| 1495 | else | ||
| 1496 | surplus_huge_pages = h->surplus_huge_pages_node[nid]; | ||
| 1497 | |||
| 1498 | return sprintf(buf, "%lu\n", surplus_huge_pages); | ||
| 1364 | } | 1499 | } |
| 1365 | HSTATE_ATTR_RO(surplus_hugepages); | 1500 | HSTATE_ATTR_RO(surplus_hugepages); |
| 1366 | 1501 | ||
| @@ -1370,6 +1505,9 @@ static struct attribute *hstate_attrs[] = { | |||
| 1370 | &free_hugepages_attr.attr, | 1505 | &free_hugepages_attr.attr, |
| 1371 | &resv_hugepages_attr.attr, | 1506 | &resv_hugepages_attr.attr, |
| 1372 | &surplus_hugepages_attr.attr, | 1507 | &surplus_hugepages_attr.attr, |
| 1508 | #ifdef CONFIG_NUMA | ||
| 1509 | &nr_hugepages_mempolicy_attr.attr, | ||
| 1510 | #endif | ||
| 1373 | NULL, | 1511 | NULL, |
| 1374 | }; | 1512 | }; |
| 1375 | 1513 | ||
| @@ -1377,19 +1515,21 @@ static struct attribute_group hstate_attr_group = { | |||
| 1377 | .attrs = hstate_attrs, | 1515 | .attrs = hstate_attrs, |
| 1378 | }; | 1516 | }; |
| 1379 | 1517 | ||
| 1380 | static int __init hugetlb_sysfs_add_hstate(struct hstate *h) | 1518 | static int __init hugetlb_sysfs_add_hstate(struct hstate *h, |
| 1519 | struct kobject *parent, | ||
| 1520 | struct kobject **hstate_kobjs, | ||
| 1521 | struct attribute_group *hstate_attr_group) | ||
| 1381 | { | 1522 | { |
| 1382 | int retval; | 1523 | int retval; |
| 1524 | int hi = h - hstates; | ||
| 1383 | 1525 | ||
| 1384 | hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, | 1526 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); |
| 1385 | hugepages_kobj); | 1527 | if (!hstate_kobjs[hi]) |
| 1386 | if (!hstate_kobjs[h - hstates]) | ||
| 1387 | return -ENOMEM; | 1528 | return -ENOMEM; |
| 1388 | 1529 | ||
| 1389 | retval = sysfs_create_group(hstate_kobjs[h - hstates], | 1530 | retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); |
| 1390 | &hstate_attr_group); | ||
| 1391 | if (retval) | 1531 | if (retval) |
| 1392 | kobject_put(hstate_kobjs[h - hstates]); | 1532 | kobject_put(hstate_kobjs[hi]); |
| 1393 | 1533 | ||
| 1394 | return retval; | 1534 | return retval; |
| 1395 | } | 1535 | } |
| @@ -1404,17 +1544,184 @@ static void __init hugetlb_sysfs_init(void) | |||
| 1404 | return; | 1544 | return; |
| 1405 | 1545 | ||
| 1406 | for_each_hstate(h) { | 1546 | for_each_hstate(h) { |
| 1407 | err = hugetlb_sysfs_add_hstate(h); | 1547 | err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, |
| 1548 | hstate_kobjs, &hstate_attr_group); | ||
| 1408 | if (err) | 1549 | if (err) |
| 1409 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", | 1550 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", |
| 1410 | h->name); | 1551 | h->name); |
| 1411 | } | 1552 | } |
| 1412 | } | 1553 | } |
| 1413 | 1554 | ||
| 1555 | #ifdef CONFIG_NUMA | ||
| 1556 | |||
| 1557 | /* | ||
| 1558 | * node_hstate/s - associate per node hstate attributes, via their kobjects, | ||
| 1559 | * with node sysdevs in node_devices[] using a parallel array. The array | ||
| 1560 | * index of a node sysdev or _hstate == node id. | ||
| 1561 | * This is here to avoid any static dependency of the node sysdev driver, in | ||
| 1562 | * the base kernel, on the hugetlb module. | ||
| 1563 | */ | ||
| 1564 | struct node_hstate { | ||
| 1565 | struct kobject *hugepages_kobj; | ||
| 1566 | struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | ||
| 1567 | }; | ||
| 1568 | struct node_hstate node_hstates[MAX_NUMNODES]; | ||
| 1569 | |||
| 1570 | /* | ||
| 1571 | * A subset of global hstate attributes for node sysdevs | ||
| 1572 | */ | ||
| 1573 | static struct attribute *per_node_hstate_attrs[] = { | ||
| 1574 | &nr_hugepages_attr.attr, | ||
| 1575 | &free_hugepages_attr.attr, | ||
| 1576 | &surplus_hugepages_attr.attr, | ||
| 1577 | NULL, | ||
| 1578 | }; | ||
| 1579 | |||
| 1580 | static struct attribute_group per_node_hstate_attr_group = { | ||
| 1581 | .attrs = per_node_hstate_attrs, | ||
| 1582 | }; | ||
| 1583 | |||
| 1584 | /* | ||
| 1585 | * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj. | ||
| 1586 | * Returns node id via non-NULL nidp. | ||
| 1587 | */ | ||
| 1588 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) | ||
| 1589 | { | ||
| 1590 | int nid; | ||
| 1591 | |||
| 1592 | for (nid = 0; nid < nr_node_ids; nid++) { | ||
| 1593 | struct node_hstate *nhs = &node_hstates[nid]; | ||
| 1594 | int i; | ||
| 1595 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | ||
| 1596 | if (nhs->hstate_kobjs[i] == kobj) { | ||
| 1597 | if (nidp) | ||
| 1598 | *nidp = nid; | ||
| 1599 | return &hstates[i]; | ||
| 1600 | } | ||
| 1601 | } | ||
| 1602 | |||
| 1603 | BUG(); | ||
| 1604 | return NULL; | ||
| 1605 | } | ||
| 1606 | |||
| 1607 | /* | ||
| 1608 | * Unregister hstate attributes from a single node sysdev. | ||
| 1609 | * No-op if no hstate attributes attached. | ||
| 1610 | */ | ||
| 1611 | void hugetlb_unregister_node(struct node *node) | ||
| 1612 | { | ||
| 1613 | struct hstate *h; | ||
| 1614 | struct node_hstate *nhs = &node_hstates[node->sysdev.id]; | ||
| 1615 | |||
| 1616 | if (!nhs->hugepages_kobj) | ||
| 1617 | return; /* no hstate attributes */ | ||
| 1618 | |||
| 1619 | for_each_hstate(h) | ||
| 1620 | if (nhs->hstate_kobjs[h - hstates]) { | ||
| 1621 | kobject_put(nhs->hstate_kobjs[h - hstates]); | ||
| 1622 | nhs->hstate_kobjs[h - hstates] = NULL; | ||
| 1623 | } | ||
| 1624 | |||
| 1625 | kobject_put(nhs->hugepages_kobj); | ||
| 1626 | nhs->hugepages_kobj = NULL; | ||
| 1627 | } | ||
| 1628 | |||
| 1629 | /* | ||
| 1630 | * hugetlb module exit: unregister hstate attributes from node sysdevs | ||
| 1631 | * that have them. | ||
| 1632 | */ | ||
| 1633 | static void hugetlb_unregister_all_nodes(void) | ||
| 1634 | { | ||
| 1635 | int nid; | ||
| 1636 | |||
| 1637 | /* | ||
| 1638 | * disable node sysdev registrations. | ||
| 1639 | */ | ||
| 1640 | register_hugetlbfs_with_node(NULL, NULL); | ||
| 1641 | |||
| 1642 | /* | ||
| 1643 | * remove hstate attributes from any nodes that have them. | ||
| 1644 | */ | ||
| 1645 | for (nid = 0; nid < nr_node_ids; nid++) | ||
| 1646 | hugetlb_unregister_node(&node_devices[nid]); | ||
| 1647 | } | ||
| 1648 | |||
| 1649 | /* | ||
| 1650 | * Register hstate attributes for a single node sysdev. | ||
| 1651 | * No-op if attributes already registered. | ||
| 1652 | */ | ||
| 1653 | void hugetlb_register_node(struct node *node) | ||
| 1654 | { | ||
| 1655 | struct hstate *h; | ||
| 1656 | struct node_hstate *nhs = &node_hstates[node->sysdev.id]; | ||
| 1657 | int err; | ||
| 1658 | |||
| 1659 | if (nhs->hugepages_kobj) | ||
| 1660 | return; /* already allocated */ | ||
| 1661 | |||
| 1662 | nhs->hugepages_kobj = kobject_create_and_add("hugepages", | ||
| 1663 | &node->sysdev.kobj); | ||
| 1664 | if (!nhs->hugepages_kobj) | ||
| 1665 | return; | ||
| 1666 | |||
| 1667 | for_each_hstate(h) { | ||
| 1668 | err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, | ||
| 1669 | nhs->hstate_kobjs, | ||
| 1670 | &per_node_hstate_attr_group); | ||
| 1671 | if (err) { | ||
| 1672 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s" | ||
| 1673 | " for node %d\n", | ||
| 1674 | h->name, node->sysdev.id); | ||
| 1675 | hugetlb_unregister_node(node); | ||
| 1676 | break; | ||
| 1677 | } | ||
| 1678 | } | ||
| 1679 | } | ||
| 1680 | |||
| 1681 | /* | ||
| 1682 | * hugetlb init time: register hstate attributes for all registered node | ||
| 1683 | * sysdevs of nodes that have memory. All on-line nodes should have | ||
| 1684 | * registered their associated sysdev by this time. | ||
| 1685 | */ | ||
| 1686 | static void hugetlb_register_all_nodes(void) | ||
| 1687 | { | ||
| 1688 | int nid; | ||
| 1689 | |||
| 1690 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
| 1691 | struct node *node = &node_devices[nid]; | ||
| 1692 | if (node->sysdev.id == nid) | ||
| 1693 | hugetlb_register_node(node); | ||
| 1694 | } | ||
| 1695 | |||
| 1696 | /* | ||
| 1697 | * Let the node sysdev driver know we're here so it can | ||
| 1698 | * [un]register hstate attributes on node hotplug. | ||
| 1699 | */ | ||
| 1700 | register_hugetlbfs_with_node(hugetlb_register_node, | ||
| 1701 | hugetlb_unregister_node); | ||
| 1702 | } | ||
| 1703 | #else /* !CONFIG_NUMA */ | ||
| 1704 | |||
| 1705 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) | ||
| 1706 | { | ||
| 1707 | BUG(); | ||
| 1708 | if (nidp) | ||
| 1709 | *nidp = -1; | ||
| 1710 | return NULL; | ||
| 1711 | } | ||
| 1712 | |||
| 1713 | static void hugetlb_unregister_all_nodes(void) { } | ||
| 1714 | |||
| 1715 | static void hugetlb_register_all_nodes(void) { } | ||
| 1716 | |||
| 1717 | #endif | ||
| 1718 | |||
| 1414 | static void __exit hugetlb_exit(void) | 1719 | static void __exit hugetlb_exit(void) |
| 1415 | { | 1720 | { |
| 1416 | struct hstate *h; | 1721 | struct hstate *h; |
| 1417 | 1722 | ||
| 1723 | hugetlb_unregister_all_nodes(); | ||
| 1724 | |||
| 1418 | for_each_hstate(h) { | 1725 | for_each_hstate(h) { |
| 1419 | kobject_put(hstate_kobjs[h - hstates]); | 1726 | kobject_put(hstate_kobjs[h - hstates]); |
| 1420 | } | 1727 | } |
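The per-node sysfs support keeps node_hstates[] as an array parallel to node_devices[], so kobj_to_node_hstate() can recover both the hstate and the node id from nothing more than an attribute's kobject; that is what lets nr_hugepages_show/store_common serve the global files and the per-node files through one code path. A small userspace sketch of that reverse lookup (names and types are illustrative only):

```c
#include <stdio.h>

#define MAX_NODES   4
#define MAX_HSTATES 2

/* Userspace analogue of node_hstates[]: one kobject-like handle per hstate,
 * per node, indexed by node id in parallel with the node devices. */
struct node_hstate {
	void *hstate_handles[MAX_HSTATES];
};

static struct node_hstate node_hstates[MAX_NODES];

/* Walk the parallel array to recover (hstate index, node id) from a handle,
 * as kobj_to_node_hstate() does for a per-node attribute kobject. */
static int handle_to_hstate_and_node(void *handle, int *nidp)
{
	for (int nid = 0; nid < MAX_NODES; nid++)
		for (int i = 0; i < MAX_HSTATES; i++)
			if (node_hstates[nid].hstate_handles[i] == handle) {
				if (nidp)
					*nidp = nid;
				return i;	/* index into hstates[] */
			}
	return -1;
}

int main(void)
{
	int dummy;
	node_hstates[3].hstate_handles[1] = &dummy;	/* "register" a handle */

	int nid;
	int hi = handle_to_hstate_and_node(&dummy, &nid);
	printf("hstate %d on node %d\n", hi, nid);	/* hstate 1 on node 3 */
	return 0;
}
```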
| @@ -1449,6 +1756,8 @@ static int __init hugetlb_init(void) | |||
| 1449 | 1756 | ||
| 1450 | hugetlb_sysfs_init(); | 1757 | hugetlb_sysfs_init(); |
| 1451 | 1758 | ||
| 1759 | hugetlb_register_all_nodes(); | ||
| 1760 | |||
| 1452 | return 0; | 1761 | return 0; |
| 1453 | } | 1762 | } |
| 1454 | module_init(hugetlb_init); | 1763 | module_init(hugetlb_init); |
| @@ -1472,8 +1781,8 @@ void __init hugetlb_add_hstate(unsigned order) | |||
| 1472 | h->free_huge_pages = 0; | 1781 | h->free_huge_pages = 0; |
| 1473 | for (i = 0; i < MAX_NUMNODES; ++i) | 1782 | for (i = 0; i < MAX_NUMNODES; ++i) |
| 1474 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1783 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
| 1475 | h->next_nid_to_alloc = first_node(node_online_map); | 1784 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); |
| 1476 | h->next_nid_to_free = first_node(node_online_map); | 1785 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); |
| 1477 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1786 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
| 1478 | huge_page_size(h)/1024); | 1787 | huge_page_size(h)/1024); |
| 1479 | 1788 | ||
| @@ -1536,9 +1845,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array) | |||
| 1536 | } | 1845 | } |
| 1537 | 1846 | ||
| 1538 | #ifdef CONFIG_SYSCTL | 1847 | #ifdef CONFIG_SYSCTL |
| 1539 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1848 | static int hugetlb_sysctl_handler_common(bool obey_mempolicy, |
| 1540 | void __user *buffer, | 1849 | struct ctl_table *table, int write, |
| 1541 | size_t *length, loff_t *ppos) | 1850 | void __user *buffer, size_t *length, loff_t *ppos) |
| 1542 | { | 1851 | { |
| 1543 | struct hstate *h = &default_hstate; | 1852 | struct hstate *h = &default_hstate; |
| 1544 | unsigned long tmp; | 1853 | unsigned long tmp; |
| @@ -1550,12 +1859,40 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, | |||
| 1550 | table->maxlen = sizeof(unsigned long); | 1859 | table->maxlen = sizeof(unsigned long); |
| 1551 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 1860 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
| 1552 | 1861 | ||
| 1553 | if (write) | 1862 | if (write) { |
| 1554 | h->max_huge_pages = set_max_huge_pages(h, tmp); | 1863 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, |
| 1864 | GFP_KERNEL | __GFP_NORETRY); | ||
| 1865 | if (!(obey_mempolicy && | ||
| 1866 | init_nodemask_of_mempolicy(nodes_allowed))) { | ||
| 1867 | NODEMASK_FREE(nodes_allowed); | ||
| 1868 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
| 1869 | } | ||
| 1870 | h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); | ||
| 1871 | |||
| 1872 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | ||
| 1873 | NODEMASK_FREE(nodes_allowed); | ||
| 1874 | } | ||
| 1555 | 1875 | ||
| 1556 | return 0; | 1876 | return 0; |
| 1557 | } | 1877 | } |
| 1558 | 1878 | ||
| 1879 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | ||
| 1880 | void __user *buffer, size_t *length, loff_t *ppos) | ||
| 1881 | { | ||
| 1882 | |||
| 1883 | return hugetlb_sysctl_handler_common(false, table, write, | ||
| 1884 | buffer, length, ppos); | ||
| 1885 | } | ||
| 1886 | |||
| 1887 | #ifdef CONFIG_NUMA | ||
| 1888 | int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, | ||
| 1889 | void __user *buffer, size_t *length, loff_t *ppos) | ||
| 1890 | { | ||
| 1891 | return hugetlb_sysctl_handler_common(true, table, write, | ||
| 1892 | buffer, length, ppos); | ||
| 1893 | } | ||
| 1894 | #endif /* CONFIG_NUMA */ | ||
| 1895 | |||
| 1559 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, | 1896 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, |
| 1560 | void __user *buffer, | 1897 | void __user *buffer, |
| 1561 | size_t *length, loff_t *ppos) | 1898 | size_t *length, loff_t *ppos) |
| @@ -1903,6 +2240,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1903 | + (vma->vm_pgoff >> PAGE_SHIFT); | 2240 | + (vma->vm_pgoff >> PAGE_SHIFT); |
| 1904 | mapping = (struct address_space *)page_private(page); | 2241 | mapping = (struct address_space *)page_private(page); |
| 1905 | 2242 | ||
| 2243 | /* | ||
| 2244 | * Take the mapping lock for the duration of the table walk. As | ||
| 2245 | * this mapping should be shared between all the VMAs, | ||
| 2246 | * __unmap_hugepage_range() is called as the lock is already held | ||
| 2247 | */ | ||
| 2248 | spin_lock(&mapping->i_mmap_lock); | ||
| 1906 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 2249 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
| 1907 | /* Do not unmap the current VMA */ | 2250 | /* Do not unmap the current VMA */ |
| 1908 | if (iter_vma == vma) | 2251 | if (iter_vma == vma) |
| @@ -1916,10 +2259,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1916 | * from the time of fork. This would look like data corruption | 2259 | * from the time of fork. This would look like data corruption |
| 1917 | */ | 2260 | */ |
| 1918 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) | 2261 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) |
| 1919 | unmap_hugepage_range(iter_vma, | 2262 | __unmap_hugepage_range(iter_vma, |
| 1920 | address, address + huge_page_size(h), | 2263 | address, address + huge_page_size(h), |
| 1921 | page); | 2264 | page); |
| 1922 | } | 2265 | } |
| 2266 | spin_unlock(&mapping->i_mmap_lock); | ||
| 1923 | 2267 | ||
| 1924 | return 1; | 2268 | return 1; |
| 1925 | } | 2269 | } |
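unmap_ref_private() now takes mapping->i_mmap_lock once around the whole prio-tree walk, and accordingly calls __unmap_hugepage_range() — the variant that expects the lock to already be held — rather than unmap_hugepage_range(), which would try to take it again. A tiny pthread sketch of that plain-wrapper/lock-held-variant convention (illustrative names, not the kernel API):

```c
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t i_mmap_lock = PTHREAD_MUTEX_INITIALIZER;

/* Variant that expects the caller to already hold i_mmap_lock, analogous to
 * __unmap_hugepage_range(). */
static void unmap_one_locked(int vma_id)
{
	printf("unmapping vma %d\n", vma_id);
}

/* Plain variant that takes the lock itself, analogous to
 * unmap_hugepage_range(). */
static void unmap_one(int vma_id)
{
	pthread_mutex_lock(&i_mmap_lock);
	unmap_one_locked(vma_id);
	pthread_mutex_unlock(&i_mmap_lock);
}

int main(void)
{
	/* Walk all sharing VMAs under a single lock acquisition, calling the
	 * lock-held variant for each, as unmap_ref_private() now does. */
	pthread_mutex_lock(&i_mmap_lock);
	for (int vma_id = 0; vma_id < 3; vma_id++)
		unmap_one_locked(vma_id);
	pthread_mutex_unlock(&i_mmap_lock);

	unmap_one(7);	/* an independent caller uses the locking variant */
	return 0;
}
```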
| @@ -1959,6 +2303,9 @@ retry_avoidcopy: | |||
| 1959 | outside_reserve = 1; | 2303 | outside_reserve = 1; |
| 1960 | 2304 | ||
| 1961 | page_cache_get(old_page); | 2305 | page_cache_get(old_page); |
| 2306 | |||
| 2307 | /* Drop page_table_lock as buddy allocator may be called */ | ||
| 2308 | spin_unlock(&mm->page_table_lock); | ||
| 1962 | new_page = alloc_huge_page(vma, address, outside_reserve); | 2309 | new_page = alloc_huge_page(vma, address, outside_reserve); |
| 1963 | 2310 | ||
| 1964 | if (IS_ERR(new_page)) { | 2311 | if (IS_ERR(new_page)) { |
| @@ -1976,19 +2323,25 @@ retry_avoidcopy: | |||
| 1976 | if (unmap_ref_private(mm, vma, old_page, address)) { | 2323 | if (unmap_ref_private(mm, vma, old_page, address)) { |
| 1977 | BUG_ON(page_count(old_page) != 1); | 2324 | BUG_ON(page_count(old_page) != 1); |
| 1978 | BUG_ON(huge_pte_none(pte)); | 2325 | BUG_ON(huge_pte_none(pte)); |
| 2326 | spin_lock(&mm->page_table_lock); | ||
| 1979 | goto retry_avoidcopy; | 2327 | goto retry_avoidcopy; |
| 1980 | } | 2328 | } |
| 1981 | WARN_ON_ONCE(1); | 2329 | WARN_ON_ONCE(1); |
| 1982 | } | 2330 | } |
| 1983 | 2331 | ||
| 2332 | /* Caller expects lock to be held */ | ||
| 2333 | spin_lock(&mm->page_table_lock); | ||
| 1984 | return -PTR_ERR(new_page); | 2334 | return -PTR_ERR(new_page); |
| 1985 | } | 2335 | } |
| 1986 | 2336 | ||
| 1987 | spin_unlock(&mm->page_table_lock); | ||
| 1988 | copy_huge_page(new_page, old_page, address, vma); | 2337 | copy_huge_page(new_page, old_page, address, vma); |
| 1989 | __SetPageUptodate(new_page); | 2338 | __SetPageUptodate(new_page); |
| 1990 | spin_lock(&mm->page_table_lock); | ||
| 1991 | 2339 | ||
| 2340 | /* | ||
| 2341 | * Retake the page_table_lock to check for racing updates | ||
| 2342 | * before the page tables are altered | ||
| 2343 | */ | ||
| 2344 | spin_lock(&mm->page_table_lock); | ||
| 1992 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2345 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
| 1993 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 2346 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
| 1994 | /* Break COW */ | 2347 | /* Break COW */ |
