Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 551 |
1 file changed, 452 insertions(+), 99 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5d7601b02874..65f38c218207 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,6 +24,7 @@ | |||
24 | #include <asm/io.h> | 24 | #include <asm/io.h> |
25 | 25 | ||
26 | #include <linux/hugetlb.h> | 26 | #include <linux/hugetlb.h> |
27 | #include <linux/node.h> | ||
27 | #include "internal.h" | 28 | #include "internal.h" |
28 | 29 | ||
29 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 30 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
@@ -622,42 +623,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
622 | } | 623 | } |
623 | 624 | ||
624 | /* | 625 | /* |
625 | * Use a helper variable to find the next node and then | 626 | * common helper functions for hstate_next_node_to_{alloc|free}. |
626 | * copy it back to next_nid_to_alloc afterwards: | 627 | * We may have allocated or freed a huge page based on a different |
627 | * otherwise there's a window in which a racer might | 628 | * nodes_allowed previously, so h->next_node_to_{alloc|free} might |
628 | * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. | 629 | * be outside of *nodes_allowed. Ensure that we use an allowed |
629 | * But we don't need to use a spin_lock here: it really | 630 | * node for alloc or free. |
630 | * doesn't matter if occasionally a racer chooses the | ||
631 | * same nid as we do. Move nid forward in the mask even | ||
632 | * if we just successfully allocated a hugepage so that | ||
633 | * the next caller gets hugepages on the next node. | ||
634 | */ | 631 | */ |
635 | static int hstate_next_node_to_alloc(struct hstate *h) | 632 | static int next_node_allowed(int nid, nodemask_t *nodes_allowed) |
636 | { | 633 | { |
637 | int next_nid; | 634 | nid = next_node(nid, *nodes_allowed); |
638 | next_nid = next_node(h->next_nid_to_alloc, node_online_map); | 635 | if (nid == MAX_NUMNODES) |
639 | if (next_nid == MAX_NUMNODES) | 636 | nid = first_node(*nodes_allowed); |
640 | next_nid = first_node(node_online_map); | 637 | VM_BUG_ON(nid >= MAX_NUMNODES); |
641 | h->next_nid_to_alloc = next_nid; | 638 | |
642 | return next_nid; | 639 | return nid; |
640 | } | ||
641 | |||
642 | static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
643 | { | ||
644 | if (!node_isset(nid, *nodes_allowed)) | ||
645 | nid = next_node_allowed(nid, nodes_allowed); | ||
646 | return nid; | ||
647 | } | ||
648 | |||
649 | /* | ||
650 | * returns the previously saved node ["this node"] from which to | ||
651 | * allocate a persistent huge page for the pool and advance the | ||
652 | * next node from which to allocate, handling wrap at end of node | ||
653 | * mask. | ||
654 | */ | ||
655 | static int hstate_next_node_to_alloc(struct hstate *h, | ||
656 | nodemask_t *nodes_allowed) | ||
657 | { | ||
658 | int nid; | ||
659 | |||
660 | VM_BUG_ON(!nodes_allowed); | ||
661 | |||
662 | nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); | ||
663 | h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); | ||
664 | |||
665 | return nid; | ||
643 | } | 666 | } |
644 | 667 | ||
645 | static int alloc_fresh_huge_page(struct hstate *h) | 668 | static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) |
646 | { | 669 | { |
647 | struct page *page; | 670 | struct page *page; |
648 | int start_nid; | 671 | int start_nid; |
649 | int next_nid; | 672 | int next_nid; |
650 | int ret = 0; | 673 | int ret = 0; |
651 | 674 | ||
652 | start_nid = h->next_nid_to_alloc; | 675 | start_nid = hstate_next_node_to_alloc(h, nodes_allowed); |
653 | next_nid = start_nid; | 676 | next_nid = start_nid; |
654 | 677 | ||
655 | do { | 678 | do { |
656 | page = alloc_fresh_huge_page_node(h, next_nid); | 679 | page = alloc_fresh_huge_page_node(h, next_nid); |
657 | if (page) | 680 | if (page) { |
658 | ret = 1; | 681 | ret = 1; |
659 | next_nid = hstate_next_node_to_alloc(h); | 682 | break; |
660 | } while (!page && next_nid != start_nid); | 683 | } |
684 | next_nid = hstate_next_node_to_alloc(h, nodes_allowed); | ||
685 | } while (next_nid != start_nid); | ||
661 | 686 | ||
662 | if (ret) | 687 | if (ret) |
663 | count_vm_event(HTLB_BUDDY_PGALLOC); | 688 | count_vm_event(HTLB_BUDDY_PGALLOC); |
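The two helpers above are easier to follow in isolation: next_node_allowed() advances round-robin through *nodes_allowed and wraps back to the first allowed node at the end of the mask, while get_valid_node_allowed() only clamps a cached nid that may have been saved under a different nodes_allowed. hstate_next_node_to_alloc()/_to_free() then return the saved node and store the advanced one, so successive pool adjustments interleave across the allowed nodes. A minimal userspace sketch of the same wrap-around walk, with a plain bitmask standing in for nodemask_t (illustration only, not part of the patch):

    #include <stdio.h>

    #define MAX_NODES 8                     /* stand-in for MAX_NUMNODES */

    /* next set bit after nid, wrapping to the first set bit */
    static int next_node_allowed(int nid, unsigned long allowed)
    {
        int i;

        for (i = nid + 1; i < MAX_NODES; i++)
            if (allowed & (1UL << i))
                return i;
        for (i = 0; i < MAX_NODES; i++)     /* wrap, like first_node() */
            if (allowed & (1UL << i))
                return i;
        return -1;                          /* empty mask */
    }

    int main(void)
    {
        unsigned long allowed = 0x5;        /* nodes 0 and 2 allowed */
        int nid = 0, i;

        for (i = 0; i < 6; i++) {           /* prints: 2 0 2 0 2 0 */
            nid = next_node_allowed(nid, allowed);
            printf("%d ", nid);
        }
        printf("\n");
        return 0;
    }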
@@ -668,17 +693,21 @@ static int alloc_fresh_huge_page(struct hstate *h) | |||
668 | } | 693 | } |
669 | 694 | ||
670 | /* | 695 | /* |
671 | * helper for free_pool_huge_page() - find next node | 696 | * helper for free_pool_huge_page() - return the previously saved |
672 | * from which to free a huge page | 697 | * node ["this node"] from which to free a huge page. Advance the |
698 | * next node id whether or not we find a free huge page to free so | ||
699 | * that the next attempt to free addresses the next node. | ||
673 | */ | 700 | */ |
674 | static int hstate_next_node_to_free(struct hstate *h) | 701 | static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) |
675 | { | 702 | { |
676 | int next_nid; | 703 | int nid; |
677 | next_nid = next_node(h->next_nid_to_free, node_online_map); | 704 | |
678 | if (next_nid == MAX_NUMNODES) | 705 | VM_BUG_ON(!nodes_allowed); |
679 | next_nid = first_node(node_online_map); | 706 | |
680 | h->next_nid_to_free = next_nid; | 707 | nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); |
681 | return next_nid; | 708 | h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); |
709 | |||
710 | return nid; | ||
682 | } | 711 | } |
683 | 712 | ||
684 | /* | 713 | /* |
@@ -687,13 +716,14 @@ static int hstate_next_node_to_free(struct hstate *h) | |||
687 | * balanced over allowed nodes. | 716 | * balanced over allowed nodes. |
688 | * Called with hugetlb_lock locked. | 717 | * Called with hugetlb_lock locked. |
689 | */ | 718 | */ |
690 | static int free_pool_huge_page(struct hstate *h, bool acct_surplus) | 719 | static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, |
720 | bool acct_surplus) | ||
691 | { | 721 | { |
692 | int start_nid; | 722 | int start_nid; |
693 | int next_nid; | 723 | int next_nid; |
694 | int ret = 0; | 724 | int ret = 0; |
695 | 725 | ||
696 | start_nid = h->next_nid_to_free; | 726 | start_nid = hstate_next_node_to_free(h, nodes_allowed); |
697 | next_nid = start_nid; | 727 | next_nid = start_nid; |
698 | 728 | ||
699 | do { | 729 | do { |
@@ -715,9 +745,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus) | |||
715 | } | 745 | } |
716 | update_and_free_page(h, page); | 746 | update_and_free_page(h, page); |
717 | ret = 1; | 747 | ret = 1; |
748 | break; | ||
718 | } | 749 | } |
719 | next_nid = hstate_next_node_to_free(h); | 750 | next_nid = hstate_next_node_to_free(h, nodes_allowed); |
720 | } while (!ret && next_nid != start_nid); | 751 | } while (next_nid != start_nid); |
721 | 752 | ||
722 | return ret; | 753 | return ret; |
723 | } | 754 | } |
@@ -911,14 +942,14 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
911 | 942 | ||
912 | /* | 943 | /* |
913 | * We want to release as many surplus pages as possible, spread | 944 | * We want to release as many surplus pages as possible, spread |
914 | * evenly across all nodes. Iterate across all nodes until we | 945 | * evenly across all nodes with memory. Iterate across these nodes |
915 | * can no longer free unreserved surplus pages. This occurs when | 946 | * until we can no longer free unreserved surplus pages. This occurs |
916 | * the nodes with surplus pages have no free pages. | 947 | * when the nodes with surplus pages have no free pages. |
917 | * free_pool_huge_page() will balance the frees across the | 948 | * free_pool_huge_page() will balance the freed pages across the |
918 | * on-line nodes for us and will handle the hstate accounting. | 949 | * on-line nodes with memory and will handle the hstate accounting. |
919 | */ | 950 | */ |
920 | while (nr_pages--) { | 951 | while (nr_pages--) { |
921 | if (!free_pool_huge_page(h, 1)) | 952 | if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) |
922 | break; | 953 | break; |
923 | } | 954 | } |
924 | } | 955 | } |
@@ -1022,16 +1053,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1022 | int __weak alloc_bootmem_huge_page(struct hstate *h) | 1053 | int __weak alloc_bootmem_huge_page(struct hstate *h) |
1023 | { | 1054 | { |
1024 | struct huge_bootmem_page *m; | 1055 | struct huge_bootmem_page *m; |
1025 | int nr_nodes = nodes_weight(node_online_map); | 1056 | int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); |
1026 | 1057 | ||
1027 | while (nr_nodes) { | 1058 | while (nr_nodes) { |
1028 | void *addr; | 1059 | void *addr; |
1029 | 1060 | ||
1030 | addr = __alloc_bootmem_node_nopanic( | 1061 | addr = __alloc_bootmem_node_nopanic( |
1031 | NODE_DATA(h->next_nid_to_alloc), | 1062 | NODE_DATA(hstate_next_node_to_alloc(h, |
1063 | &node_states[N_HIGH_MEMORY])), | ||
1032 | huge_page_size(h), huge_page_size(h), 0); | 1064 | huge_page_size(h), huge_page_size(h), 0); |
1033 | 1065 | ||
1034 | hstate_next_node_to_alloc(h); | ||
1035 | if (addr) { | 1066 | if (addr) { |
1036 | /* | 1067 | /* |
1037 | * Use the beginning of the huge page to store the | 1068 | * Use the beginning of the huge page to store the |
@@ -1084,7 +1115,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) | |||
1084 | if (h->order >= MAX_ORDER) { | 1115 | if (h->order >= MAX_ORDER) { |
1085 | if (!alloc_bootmem_huge_page(h)) | 1116 | if (!alloc_bootmem_huge_page(h)) |
1086 | break; | 1117 | break; |
1087 | } else if (!alloc_fresh_huge_page(h)) | 1118 | } else if (!alloc_fresh_huge_page(h, |
1119 | &node_states[N_HIGH_MEMORY])) | ||
1088 | break; | 1120 | break; |
1089 | } | 1121 | } |
1090 | h->max_huge_pages = i; | 1122 | h->max_huge_pages = i; |
@@ -1126,14 +1158,15 @@ static void __init report_hugepages(void) | |||
1126 | } | 1158 | } |
1127 | 1159 | ||
1128 | #ifdef CONFIG_HIGHMEM | 1160 | #ifdef CONFIG_HIGHMEM |
1129 | static void try_to_free_low(struct hstate *h, unsigned long count) | 1161 | static void try_to_free_low(struct hstate *h, unsigned long count, |
1162 | nodemask_t *nodes_allowed) | ||
1130 | { | 1163 | { |
1131 | int i; | 1164 | int i; |
1132 | 1165 | ||
1133 | if (h->order >= MAX_ORDER) | 1166 | if (h->order >= MAX_ORDER) |
1134 | return; | 1167 | return; |
1135 | 1168 | ||
1136 | for (i = 0; i < MAX_NUMNODES; ++i) { | 1169 | for_each_node_mask(i, *nodes_allowed) { |
1137 | struct page *page, *next; | 1170 | struct page *page, *next; |
1138 | struct list_head *freel = &h->hugepage_freelists[i]; | 1171 | struct list_head *freel = &h->hugepage_freelists[i]; |
1139 | list_for_each_entry_safe(page, next, freel, lru) { | 1172 | list_for_each_entry_safe(page, next, freel, lru) { |
@@ -1149,7 +1182,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count) | |||
1149 | } | 1182 | } |
1150 | } | 1183 | } |
1151 | #else | 1184 | #else |
1152 | static inline void try_to_free_low(struct hstate *h, unsigned long count) | 1185 | static inline void try_to_free_low(struct hstate *h, unsigned long count, |
1186 | nodemask_t *nodes_allowed) | ||
1153 | { | 1187 | { |
1154 | } | 1188 | } |
1155 | #endif | 1189 | #endif |
@@ -1159,7 +1193,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) | |||
1159 | * balanced by operating on them in a round-robin fashion. | 1193 | * balanced by operating on them in a round-robin fashion. |
1160 | * Returns 1 if an adjustment was made. | 1194 | * Returns 1 if an adjustment was made. |
1161 | */ | 1195 | */ |
1162 | static int adjust_pool_surplus(struct hstate *h, int delta) | 1196 | static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, |
1197 | int delta) | ||
1163 | { | 1198 | { |
1164 | int start_nid, next_nid; | 1199 | int start_nid, next_nid; |
1165 | int ret = 0; | 1200 | int ret = 0; |
@@ -1167,29 +1202,33 @@ static int adjust_pool_surplus(struct hstate *h, int delta) | |||
1167 | VM_BUG_ON(delta != -1 && delta != 1); | 1202 | VM_BUG_ON(delta != -1 && delta != 1); |
1168 | 1203 | ||
1169 | if (delta < 0) | 1204 | if (delta < 0) |
1170 | start_nid = h->next_nid_to_alloc; | 1205 | start_nid = hstate_next_node_to_alloc(h, nodes_allowed); |
1171 | else | 1206 | else |
1172 | start_nid = h->next_nid_to_free; | 1207 | start_nid = hstate_next_node_to_free(h, nodes_allowed); |
1173 | next_nid = start_nid; | 1208 | next_nid = start_nid; |
1174 | 1209 | ||
1175 | do { | 1210 | do { |
1176 | int nid = next_nid; | 1211 | int nid = next_nid; |
1177 | if (delta < 0) { | 1212 | if (delta < 0) { |
1178 | next_nid = hstate_next_node_to_alloc(h); | ||
1179 | /* | 1213 | /* |
1180 | * To shrink on this node, there must be a surplus page | 1214 | * To shrink on this node, there must be a surplus page |
1181 | */ | 1215 | */ |
1182 | if (!h->surplus_huge_pages_node[nid]) | 1216 | if (!h->surplus_huge_pages_node[nid]) { |
1217 | next_nid = hstate_next_node_to_alloc(h, | ||
1218 | nodes_allowed); | ||
1183 | continue; | 1219 | continue; |
1220 | } | ||
1184 | } | 1221 | } |
1185 | if (delta > 0) { | 1222 | if (delta > 0) { |
1186 | next_nid = hstate_next_node_to_free(h); | ||
1187 | /* | 1223 | /* |
1188 | * Surplus cannot exceed the total number of pages | 1224 | * Surplus cannot exceed the total number of pages |
1189 | */ | 1225 | */ |
1190 | if (h->surplus_huge_pages_node[nid] >= | 1226 | if (h->surplus_huge_pages_node[nid] >= |
1191 | h->nr_huge_pages_node[nid]) | 1227 | h->nr_huge_pages_node[nid]) { |
1228 | next_nid = hstate_next_node_to_free(h, | ||
1229 | nodes_allowed); | ||
1192 | continue; | 1230 | continue; |
1231 | } | ||
1193 | } | 1232 | } |
1194 | 1233 | ||
1195 | h->surplus_huge_pages += delta; | 1234 | h->surplus_huge_pages += delta; |
@@ -1202,7 +1241,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta) | |||
1202 | } | 1241 | } |
1203 | 1242 | ||
1204 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) | 1243 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
1205 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | 1244 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, |
1245 | nodemask_t *nodes_allowed) | ||
1206 | { | 1246 | { |
1207 | unsigned long min_count, ret; | 1247 | unsigned long min_count, ret; |
1208 | 1248 | ||
@@ -1222,7 +1262,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
1222 | */ | 1262 | */ |
1223 | spin_lock(&hugetlb_lock); | 1263 | spin_lock(&hugetlb_lock); |
1224 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { | 1264 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { |
1225 | if (!adjust_pool_surplus(h, -1)) | 1265 | if (!adjust_pool_surplus(h, nodes_allowed, -1)) |
1226 | break; | 1266 | break; |
1227 | } | 1267 | } |
1228 | 1268 | ||
@@ -1233,11 +1273,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
1233 | * and reducing the surplus. | 1273 | * and reducing the surplus. |
1234 | */ | 1274 | */ |
1235 | spin_unlock(&hugetlb_lock); | 1275 | spin_unlock(&hugetlb_lock); |
1236 | ret = alloc_fresh_huge_page(h); | 1276 | ret = alloc_fresh_huge_page(h, nodes_allowed); |
1237 | spin_lock(&hugetlb_lock); | 1277 | spin_lock(&hugetlb_lock); |
1238 | if (!ret) | 1278 | if (!ret) |
1239 | goto out; | 1279 | goto out; |
1240 | 1280 | ||
1281 | /* Bail for signals. Probably ctrl-c from user */ | ||
1282 | if (signal_pending(current)) | ||
1283 | goto out; | ||
1241 | } | 1284 | } |
1242 | 1285 | ||
1243 | /* | 1286 | /* |
@@ -1257,13 +1300,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
1257 | */ | 1300 | */ |
1258 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; | 1301 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; |
1259 | min_count = max(count, min_count); | 1302 | min_count = max(count, min_count); |
1260 | try_to_free_low(h, min_count); | 1303 | try_to_free_low(h, min_count, nodes_allowed); |
1261 | while (min_count < persistent_huge_pages(h)) { | 1304 | while (min_count < persistent_huge_pages(h)) { |
1262 | if (!free_pool_huge_page(h, 0)) | 1305 | if (!free_pool_huge_page(h, nodes_allowed, 0)) |
1263 | break; | 1306 | break; |
1264 | } | 1307 | } |
1265 | while (count < persistent_huge_pages(h)) { | 1308 | while (count < persistent_huge_pages(h)) { |
1266 | if (!adjust_pool_surplus(h, 1)) | 1309 | if (!adjust_pool_surplus(h, nodes_allowed, 1)) |
1267 | break; | 1310 | break; |
1268 | } | 1311 | } |
1269 | out: | 1312 | out: |
@@ -1282,43 +1325,117 @@ out: | |||
1282 | static struct kobject *hugepages_kobj; | 1325 | static struct kobject *hugepages_kobj; |
1283 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | 1326 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; |
1284 | 1327 | ||
1285 | static struct hstate *kobj_to_hstate(struct kobject *kobj) | 1328 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); |
1329 | |||
1330 | static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) | ||
1286 | { | 1331 | { |
1287 | int i; | 1332 | int i; |
1333 | |||
1288 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | 1334 | for (i = 0; i < HUGE_MAX_HSTATE; i++) |
1289 | if (hstate_kobjs[i] == kobj) | 1335 | if (hstate_kobjs[i] == kobj) { |
1336 | if (nidp) | ||
1337 | *nidp = NUMA_NO_NODE; | ||
1290 | return &hstates[i]; | 1338 | return &hstates[i]; |
1291 | BUG(); | 1339 | } |
1292 | return NULL; | 1340 | |
1341 | return kobj_to_node_hstate(kobj, nidp); | ||
1293 | } | 1342 | } |
1294 | 1343 | ||
1295 | static ssize_t nr_hugepages_show(struct kobject *kobj, | 1344 | static ssize_t nr_hugepages_show_common(struct kobject *kobj, |
1296 | struct kobj_attribute *attr, char *buf) | 1345 | struct kobj_attribute *attr, char *buf) |
1297 | { | 1346 | { |
1298 | struct hstate *h = kobj_to_hstate(kobj); | 1347 | struct hstate *h; |
1299 | return sprintf(buf, "%lu\n", h->nr_huge_pages); | 1348 | unsigned long nr_huge_pages; |
1349 | int nid; | ||
1350 | |||
1351 | h = kobj_to_hstate(kobj, &nid); | ||
1352 | if (nid == NUMA_NO_NODE) | ||
1353 | nr_huge_pages = h->nr_huge_pages; | ||
1354 | else | ||
1355 | nr_huge_pages = h->nr_huge_pages_node[nid]; | ||
1356 | |||
1357 | return sprintf(buf, "%lu\n", nr_huge_pages); | ||
1300 | } | 1358 | } |
1301 | static ssize_t nr_hugepages_store(struct kobject *kobj, | 1359 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, |
1302 | struct kobj_attribute *attr, const char *buf, size_t count) | 1360 | struct kobject *kobj, struct kobj_attribute *attr, |
1361 | const char *buf, size_t len) | ||
1303 | { | 1362 | { |
1304 | int err; | 1363 | int err; |
1305 | unsigned long input; | 1364 | int nid; |
1306 | struct hstate *h = kobj_to_hstate(kobj); | 1365 | unsigned long count; |
1366 | struct hstate *h; | ||
1367 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); | ||
1307 | 1368 | ||
1308 | err = strict_strtoul(buf, 10, &input); | 1369 | err = strict_strtoul(buf, 10, &count); |
1309 | if (err) | 1370 | if (err) |
1310 | return 0; | 1371 | return 0; |
1311 | 1372 | ||
1312 | h->max_huge_pages = set_max_huge_pages(h, input); | 1373 | h = kobj_to_hstate(kobj, &nid); |
1374 | if (nid == NUMA_NO_NODE) { | ||
1375 | /* | ||
1376 | * global hstate attribute | ||
1377 | */ | ||
1378 | if (!(obey_mempolicy && | ||
1379 | init_nodemask_of_mempolicy(nodes_allowed))) { | ||
1380 | NODEMASK_FREE(nodes_allowed); | ||
1381 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
1382 | } | ||
1383 | } else if (nodes_allowed) { | ||
1384 | /* | ||
1385 | * per node hstate attribute: adjust count to global, | ||
1386 | * but restrict alloc/free to the specified node. | ||
1387 | */ | ||
1388 | count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; | ||
1389 | init_nodemask_of_node(nodes_allowed, nid); | ||
1390 | } else | ||
1391 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
1392 | |||
1393 | h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); | ||
1313 | 1394 | ||
1314 | return count; | 1395 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) |
1396 | NODEMASK_FREE(nodes_allowed); | ||
1397 | |||
1398 | return len; | ||
1399 | } | ||
1400 | |||
1401 | static ssize_t nr_hugepages_show(struct kobject *kobj, | ||
1402 | struct kobj_attribute *attr, char *buf) | ||
1403 | { | ||
1404 | return nr_hugepages_show_common(kobj, attr, buf); | ||
1405 | } | ||
1406 | |||
1407 | static ssize_t nr_hugepages_store(struct kobject *kobj, | ||
1408 | struct kobj_attribute *attr, const char *buf, size_t len) | ||
1409 | { | ||
1410 | return nr_hugepages_store_common(false, kobj, attr, buf, len); | ||
1315 | } | 1411 | } |
1316 | HSTATE_ATTR(nr_hugepages); | 1412 | HSTATE_ATTR(nr_hugepages); |
1317 | 1413 | ||
1414 | #ifdef CONFIG_NUMA | ||
1415 | |||
1416 | /* | ||
1417 | * hstate attribute for optionally mempolicy-based constraint on persistent | ||
1418 | * huge page alloc/free. | ||
1419 | */ | ||
1420 | static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, | ||
1421 | struct kobj_attribute *attr, char *buf) | ||
1422 | { | ||
1423 | return nr_hugepages_show_common(kobj, attr, buf); | ||
1424 | } | ||
1425 | |||
1426 | static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, | ||
1427 | struct kobj_attribute *attr, const char *buf, size_t len) | ||
1428 | { | ||
1429 | return nr_hugepages_store_common(true, kobj, attr, buf, len); | ||
1430 | } | ||
1431 | HSTATE_ATTR(nr_hugepages_mempolicy); | ||
1432 | #endif | ||
1433 | |||
1434 | |||
1318 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, | 1435 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, |
1319 | struct kobj_attribute *attr, char *buf) | 1436 | struct kobj_attribute *attr, char *buf) |
1320 | { | 1437 | { |
1321 | struct hstate *h = kobj_to_hstate(kobj); | 1438 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1322 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); | 1439 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); |
1323 | } | 1440 | } |
1324 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | 1441 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, |
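With nr_hugepages_store_common() above, the same attribute is served at two scopes: writes to the global /sys/kernel/mm/hugepages/hugepages-<size>kB/nr_hugepages size the whole pool (and nr_hugepages_mempolicy additionally constrains it to the writer's mempolicy), while the per-node copy registered later in this patch restricts alloc/free to that node and folds the per-node request back into a global count. A rough userspace sketch of driving the per-node file (illustration only; it assumes node 1, 2 MB huge pages, and the node sysdev path used by this patch):

    #include <stdio.h>

    /* request 'count' persistent 2MB huge pages on node 'nid' */
    static int set_node_hugepages(int nid, unsigned long count)
    {
        char path[128];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/node/node%d/hugepages/"
                 "hugepages-2048kB/nr_hugepages", nid);
        f = fopen(path, "w");
        if (!f)
            return -1;
        fprintf(f, "%lu\n", count);
        return fclose(f);
    }

    int main(void)
    {
        if (set_node_hugepages(1, 64))      /* 64 huge pages on node 1 */
            perror("nr_hugepages");
        return 0;
    }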
@@ -1326,7 +1443,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | |||
1326 | { | 1443 | { |
1327 | int err; | 1444 | int err; |
1328 | unsigned long input; | 1445 | unsigned long input; |
1329 | struct hstate *h = kobj_to_hstate(kobj); | 1446 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1330 | 1447 | ||
1331 | err = strict_strtoul(buf, 10, &input); | 1448 | err = strict_strtoul(buf, 10, &input); |
1332 | if (err) | 1449 | if (err) |
@@ -1343,15 +1460,24 @@ HSTATE_ATTR(nr_overcommit_hugepages); | |||
1343 | static ssize_t free_hugepages_show(struct kobject *kobj, | 1460 | static ssize_t free_hugepages_show(struct kobject *kobj, |
1344 | struct kobj_attribute *attr, char *buf) | 1461 | struct kobj_attribute *attr, char *buf) |
1345 | { | 1462 | { |
1346 | struct hstate *h = kobj_to_hstate(kobj); | 1463 | struct hstate *h; |
1347 | return sprintf(buf, "%lu\n", h->free_huge_pages); | 1464 | unsigned long free_huge_pages; |
1465 | int nid; | ||
1466 | |||
1467 | h = kobj_to_hstate(kobj, &nid); | ||
1468 | if (nid == NUMA_NO_NODE) | ||
1469 | free_huge_pages = h->free_huge_pages; | ||
1470 | else | ||
1471 | free_huge_pages = h->free_huge_pages_node[nid]; | ||
1472 | |||
1473 | return sprintf(buf, "%lu\n", free_huge_pages); | ||
1348 | } | 1474 | } |
1349 | HSTATE_ATTR_RO(free_hugepages); | 1475 | HSTATE_ATTR_RO(free_hugepages); |
1350 | 1476 | ||
1351 | static ssize_t resv_hugepages_show(struct kobject *kobj, | 1477 | static ssize_t resv_hugepages_show(struct kobject *kobj, |
1352 | struct kobj_attribute *attr, char *buf) | 1478 | struct kobj_attribute *attr, char *buf) |
1353 | { | 1479 | { |
1354 | struct hstate *h = kobj_to_hstate(kobj); | 1480 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
1355 | return sprintf(buf, "%lu\n", h->resv_huge_pages); | 1481 | return sprintf(buf, "%lu\n", h->resv_huge_pages); |
1356 | } | 1482 | } |
1357 | HSTATE_ATTR_RO(resv_hugepages); | 1483 | HSTATE_ATTR_RO(resv_hugepages); |
@@ -1359,8 +1485,17 @@ HSTATE_ATTR_RO(resv_hugepages); | |||
1359 | static ssize_t surplus_hugepages_show(struct kobject *kobj, | 1485 | static ssize_t surplus_hugepages_show(struct kobject *kobj, |
1360 | struct kobj_attribute *attr, char *buf) | 1486 | struct kobj_attribute *attr, char *buf) |
1361 | { | 1487 | { |
1362 | struct hstate *h = kobj_to_hstate(kobj); | 1488 | struct hstate *h; |
1363 | return sprintf(buf, "%lu\n", h->surplus_huge_pages); | 1489 | unsigned long surplus_huge_pages; |
1490 | int nid; | ||
1491 | |||
1492 | h = kobj_to_hstate(kobj, &nid); | ||
1493 | if (nid == NUMA_NO_NODE) | ||
1494 | surplus_huge_pages = h->surplus_huge_pages; | ||
1495 | else | ||
1496 | surplus_huge_pages = h->surplus_huge_pages_node[nid]; | ||
1497 | |||
1498 | return sprintf(buf, "%lu\n", surplus_huge_pages); | ||
1364 | } | 1499 | } |
1365 | HSTATE_ATTR_RO(surplus_hugepages); | 1500 | HSTATE_ATTR_RO(surplus_hugepages); |
1366 | 1501 | ||
@@ -1370,6 +1505,9 @@ static struct attribute *hstate_attrs[] = { | |||
1370 | &free_hugepages_attr.attr, | 1505 | &free_hugepages_attr.attr, |
1371 | &resv_hugepages_attr.attr, | 1506 | &resv_hugepages_attr.attr, |
1372 | &surplus_hugepages_attr.attr, | 1507 | &surplus_hugepages_attr.attr, |
1508 | #ifdef CONFIG_NUMA | ||
1509 | &nr_hugepages_mempolicy_attr.attr, | ||
1510 | #endif | ||
1373 | NULL, | 1511 | NULL, |
1374 | }; | 1512 | }; |
1375 | 1513 | ||
@@ -1377,19 +1515,21 @@ static struct attribute_group hstate_attr_group = { | |||
1377 | .attrs = hstate_attrs, | 1515 | .attrs = hstate_attrs, |
1378 | }; | 1516 | }; |
1379 | 1517 | ||
1380 | static int __init hugetlb_sysfs_add_hstate(struct hstate *h) | 1518 | static int __init hugetlb_sysfs_add_hstate(struct hstate *h, |
1519 | struct kobject *parent, | ||
1520 | struct kobject **hstate_kobjs, | ||
1521 | struct attribute_group *hstate_attr_group) | ||
1381 | { | 1522 | { |
1382 | int retval; | 1523 | int retval; |
1524 | int hi = h - hstates; | ||
1383 | 1525 | ||
1384 | hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, | 1526 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); |
1385 | hugepages_kobj); | 1527 | if (!hstate_kobjs[hi]) |
1386 | if (!hstate_kobjs[h - hstates]) | ||
1387 | return -ENOMEM; | 1528 | return -ENOMEM; |
1388 | 1529 | ||
1389 | retval = sysfs_create_group(hstate_kobjs[h - hstates], | 1530 | retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); |
1390 | &hstate_attr_group); | ||
1391 | if (retval) | 1531 | if (retval) |
1392 | kobject_put(hstate_kobjs[h - hstates]); | 1532 | kobject_put(hstate_kobjs[hi]); |
1393 | 1533 | ||
1394 | return retval; | 1534 | return retval; |
1395 | } | 1535 | } |
@@ -1404,17 +1544,184 @@ static void __init hugetlb_sysfs_init(void) | |||
1404 | return; | 1544 | return; |
1405 | 1545 | ||
1406 | for_each_hstate(h) { | 1546 | for_each_hstate(h) { |
1407 | err = hugetlb_sysfs_add_hstate(h); | 1547 | err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, |
1548 | hstate_kobjs, &hstate_attr_group); | ||
1408 | if (err) | 1549 | if (err) |
1409 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", | 1550 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", |
1410 | h->name); | 1551 | h->name); |
1411 | } | 1552 | } |
1412 | } | 1553 | } |
1413 | 1554 | ||
1555 | #ifdef CONFIG_NUMA | ||
1556 | |||
1557 | /* | ||
1558 | * node_hstate/s - associate per node hstate attributes, via their kobjects, | ||
1559 | * with node sysdevs in node_devices[] using a parallel array. The array | ||
1560 | * index of a node sysdev or _hstate == node id. | ||
1561 | * This is here to avoid any static dependency of the node sysdev driver, in | ||
1562 | * the base kernel, on the hugetlb module. | ||
1563 | */ | ||
1564 | struct node_hstate { | ||
1565 | struct kobject *hugepages_kobj; | ||
1566 | struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | ||
1567 | }; | ||
1568 | struct node_hstate node_hstates[MAX_NUMNODES]; | ||
1569 | |||
1570 | /* | ||
1571 | * A subset of global hstate attributes for node sysdevs | ||
1572 | */ | ||
1573 | static struct attribute *per_node_hstate_attrs[] = { | ||
1574 | &nr_hugepages_attr.attr, | ||
1575 | &free_hugepages_attr.attr, | ||
1576 | &surplus_hugepages_attr.attr, | ||
1577 | NULL, | ||
1578 | }; | ||
1579 | |||
1580 | static struct attribute_group per_node_hstate_attr_group = { | ||
1581 | .attrs = per_node_hstate_attrs, | ||
1582 | }; | ||
1583 | |||
1584 | /* | ||
1585 | * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj. | ||
1586 | * Returns node id via non-NULL nidp. | ||
1587 | */ | ||
1588 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) | ||
1589 | { | ||
1590 | int nid; | ||
1591 | |||
1592 | for (nid = 0; nid < nr_node_ids; nid++) { | ||
1593 | struct node_hstate *nhs = &node_hstates[nid]; | ||
1594 | int i; | ||
1595 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | ||
1596 | if (nhs->hstate_kobjs[i] == kobj) { | ||
1597 | if (nidp) | ||
1598 | *nidp = nid; | ||
1599 | return &hstates[i]; | ||
1600 | } | ||
1601 | } | ||
1602 | |||
1603 | BUG(); | ||
1604 | return NULL; | ||
1605 | } | ||
1606 | |||
1607 | /* | ||
1608 | * Unregister hstate attributes from a single node sysdev. | ||
1609 | * No-op if no hstate attributes attached. | ||
1610 | */ | ||
1611 | void hugetlb_unregister_node(struct node *node) | ||
1612 | { | ||
1613 | struct hstate *h; | ||
1614 | struct node_hstate *nhs = &node_hstates[node->sysdev.id]; | ||
1615 | |||
1616 | if (!nhs->hugepages_kobj) | ||
1617 | return; /* no hstate attributes */ | ||
1618 | |||
1619 | for_each_hstate(h) | ||
1620 | if (nhs->hstate_kobjs[h - hstates]) { | ||
1621 | kobject_put(nhs->hstate_kobjs[h - hstates]); | ||
1622 | nhs->hstate_kobjs[h - hstates] = NULL; | ||
1623 | } | ||
1624 | |||
1625 | kobject_put(nhs->hugepages_kobj); | ||
1626 | nhs->hugepages_kobj = NULL; | ||
1627 | } | ||
1628 | |||
1629 | /* | ||
1630 | * hugetlb module exit: unregister hstate attributes from node sysdevs | ||
1631 | * that have them. | ||
1632 | */ | ||
1633 | static void hugetlb_unregister_all_nodes(void) | ||
1634 | { | ||
1635 | int nid; | ||
1636 | |||
1637 | /* | ||
1638 | * disable node sysdev registrations. | ||
1639 | */ | ||
1640 | register_hugetlbfs_with_node(NULL, NULL); | ||
1641 | |||
1642 | /* | ||
1643 | * remove hstate attributes from any nodes that have them. | ||
1644 | */ | ||
1645 | for (nid = 0; nid < nr_node_ids; nid++) | ||
1646 | hugetlb_unregister_node(&node_devices[nid]); | ||
1647 | } | ||
1648 | |||
1649 | /* | ||
1650 | * Register hstate attributes for a single node sysdev. | ||
1651 | * No-op if attributes already registered. | ||
1652 | */ | ||
1653 | void hugetlb_register_node(struct node *node) | ||
1654 | { | ||
1655 | struct hstate *h; | ||
1656 | struct node_hstate *nhs = &node_hstates[node->sysdev.id]; | ||
1657 | int err; | ||
1658 | |||
1659 | if (nhs->hugepages_kobj) | ||
1660 | return; /* already allocated */ | ||
1661 | |||
1662 | nhs->hugepages_kobj = kobject_create_and_add("hugepages", | ||
1663 | &node->sysdev.kobj); | ||
1664 | if (!nhs->hugepages_kobj) | ||
1665 | return; | ||
1666 | |||
1667 | for_each_hstate(h) { | ||
1668 | err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, | ||
1669 | nhs->hstate_kobjs, | ||
1670 | &per_node_hstate_attr_group); | ||
1671 | if (err) { | ||
1672 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s" | ||
1673 | " for node %d\n", | ||
1674 | h->name, node->sysdev.id); | ||
1675 | hugetlb_unregister_node(node); | ||
1676 | break; | ||
1677 | } | ||
1678 | } | ||
1679 | } | ||
1680 | |||
1681 | /* | ||
1682 | * hugetlb init time: register hstate attributes for all registered node | ||
1683 | * sysdevs of nodes that have memory. All on-line nodes should have | ||
1684 | * registered their associated sysdev by this time. | ||
1685 | */ | ||
1686 | static void hugetlb_register_all_nodes(void) | ||
1687 | { | ||
1688 | int nid; | ||
1689 | |||
1690 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
1691 | struct node *node = &node_devices[nid]; | ||
1692 | if (node->sysdev.id == nid) | ||
1693 | hugetlb_register_node(node); | ||
1694 | } | ||
1695 | |||
1696 | /* | ||
1697 | * Let the node sysdev driver know we're here so it can | ||
1698 | * [un]register hstate attributes on node hotplug. | ||
1699 | */ | ||
1700 | register_hugetlbfs_with_node(hugetlb_register_node, | ||
1701 | hugetlb_unregister_node); | ||
1702 | } | ||
1703 | #else /* !CONFIG_NUMA */ | ||
1704 | |||
1705 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) | ||
1706 | { | ||
1707 | BUG(); | ||
1708 | if (nidp) | ||
1709 | *nidp = -1; | ||
1710 | return NULL; | ||
1711 | } | ||
1712 | |||
1713 | static void hugetlb_unregister_all_nodes(void) { } | ||
1714 | |||
1715 | static void hugetlb_register_all_nodes(void) { } | ||
1716 | |||
1717 | #endif | ||
1718 | |||
1414 | static void __exit hugetlb_exit(void) | 1719 | static void __exit hugetlb_exit(void) |
1415 | { | 1720 | { |
1416 | struct hstate *h; | 1721 | struct hstate *h; |
1417 | 1722 | ||
1723 | hugetlb_unregister_all_nodes(); | ||
1724 | |||
1418 | for_each_hstate(h) { | 1725 | for_each_hstate(h) { |
1419 | kobject_put(hstate_kobjs[h - hstates]); | 1726 | kobject_put(hstate_kobjs[h - hstates]); |
1420 | } | 1727 | } |
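The node_hstates[] array above is deliberately a parallel array keyed by node id rather than a field in the node structure: as the comment notes, this avoids any static dependency of the node sysdev driver on hugetlb. The hand-off works through register_hugetlbfs_with_node(), which lets the node driver call back into hugetlb on node hotplug without linking against it. A simplified, self-contained sketch of that callback pattern (names and types reduced to stubs; not the kernel's actual definitions):

    #include <stdio.h>

    struct node { int id; };                /* stub for the sysdev node */

    typedef void (*node_cb)(struct node *);

    static node_cb register_cb, unregister_cb;

    /* the node driver exports this hook; hugetlb passes its handlers
       at init and NULL at exit, so neither side links the other */
    static void register_hugetlbfs_with_node(node_cb reg, node_cb unreg)
    {
        register_cb = reg;
        unregister_cb = unreg;
    }

    static void hugetlb_register_node(struct node *n)
    {
        printf("add hstate attrs under node%d\n", n->id);
    }

    static void hugetlb_unregister_node(struct node *n)
    {
        printf("remove hstate attrs from node%d\n", n->id);
    }

    /* node hotplug paths then invoke whatever was registered */
    static void node_online(struct node *n)
    {
        if (register_cb)
            register_cb(n);
    }

    static void node_offline(struct node *n)
    {
        if (unregister_cb)
            unregister_cb(n);
    }

    int main(void)
    {
        struct node n1 = { 1 };

        register_hugetlbfs_with_node(hugetlb_register_node,
                                     hugetlb_unregister_node);
        node_online(&n1);
        node_offline(&n1);
        return 0;
    }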
@@ -1449,6 +1756,8 @@ static int __init hugetlb_init(void) | |||
1449 | 1756 | ||
1450 | hugetlb_sysfs_init(); | 1757 | hugetlb_sysfs_init(); |
1451 | 1758 | ||
1759 | hugetlb_register_all_nodes(); | ||
1760 | |||
1452 | return 0; | 1761 | return 0; |
1453 | } | 1762 | } |
1454 | module_init(hugetlb_init); | 1763 | module_init(hugetlb_init); |
@@ -1472,8 +1781,8 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1472 | h->free_huge_pages = 0; | 1781 | h->free_huge_pages = 0; |
1473 | for (i = 0; i < MAX_NUMNODES; ++i) | 1782 | for (i = 0; i < MAX_NUMNODES; ++i) |
1474 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1783 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
1475 | h->next_nid_to_alloc = first_node(node_online_map); | 1784 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); |
1476 | h->next_nid_to_free = first_node(node_online_map); | 1785 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); |
1477 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1786 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
1478 | huge_page_size(h)/1024); | 1787 | huge_page_size(h)/1024); |
1479 | 1788 | ||
@@ -1536,9 +1845,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array) | |||
1536 | } | 1845 | } |
1537 | 1846 | ||
1538 | #ifdef CONFIG_SYSCTL | 1847 | #ifdef CONFIG_SYSCTL |
1539 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1848 | static int hugetlb_sysctl_handler_common(bool obey_mempolicy, |
1540 | void __user *buffer, | 1849 | struct ctl_table *table, int write, |
1541 | size_t *length, loff_t *ppos) | 1850 | void __user *buffer, size_t *length, loff_t *ppos) |
1542 | { | 1851 | { |
1543 | struct hstate *h = &default_hstate; | 1852 | struct hstate *h = &default_hstate; |
1544 | unsigned long tmp; | 1853 | unsigned long tmp; |
@@ -1550,12 +1859,40 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, | |||
1550 | table->maxlen = sizeof(unsigned long); | 1859 | table->maxlen = sizeof(unsigned long); |
1551 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 1860 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
1552 | 1861 | ||
1553 | if (write) | 1862 | if (write) { |
1554 | h->max_huge_pages = set_max_huge_pages(h, tmp); | 1863 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, |
1864 | GFP_KERNEL | __GFP_NORETRY); | ||
1865 | if (!(obey_mempolicy && | ||
1866 | init_nodemask_of_mempolicy(nodes_allowed))) { | ||
1867 | NODEMASK_FREE(nodes_allowed); | ||
1868 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
1869 | } | ||
1870 | h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); | ||
1871 | |||
1872 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | ||
1873 | NODEMASK_FREE(nodes_allowed); | ||
1874 | } | ||
1555 | 1875 | ||
1556 | return 0; | 1876 | return 0; |
1557 | } | 1877 | } |
1558 | 1878 | ||
1879 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | ||
1880 | void __user *buffer, size_t *length, loff_t *ppos) | ||
1881 | { | ||
1882 | |||
1883 | return hugetlb_sysctl_handler_common(false, table, write, | ||
1884 | buffer, length, ppos); | ||
1885 | } | ||
1886 | |||
1887 | #ifdef CONFIG_NUMA | ||
1888 | int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, | ||
1889 | void __user *buffer, size_t *length, loff_t *ppos) | ||
1890 | { | ||
1891 | return hugetlb_sysctl_handler_common(true, table, write, | ||
1892 | buffer, length, ppos); | ||
1893 | } | ||
1894 | #endif /* CONFIG_NUMA */ | ||
1895 | |||
1559 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, | 1896 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, |
1560 | void __user *buffer, | 1897 | void __user *buffer, |
1561 | size_t *length, loff_t *ppos) | 1898 | size_t *length, loff_t *ppos) |
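Both handlers above funnel into set_max_huge_pages(): plain vm.nr_hugepages passes all nodes with memory, while the CONFIG_NUMA-only vm.nr_hugepages_mempolicy derives nodes_allowed from the writing task's NUMA mempolicy via init_nodemask_of_mempolicy(). A rough sketch of a task binding itself to one node and then sizing the pool through the mempolicy-aware sysctl (illustration only; assumes libnuma's <numaif.h> and the nr_hugepages_mempolicy proc entry this series adds):

    #include <numaif.h>                     /* set_mempolicy(), MPOL_BIND */
    #include <stdio.h>

    int main(void)
    {
        unsigned long nodemask = 1UL << 1;  /* node 1 only */
        FILE *f;

        /* constrain this task's allocations to node 1 */
        if (set_mempolicy(MPOL_BIND, &nodemask, 8 * sizeof(nodemask)))
            perror("set_mempolicy");

        /* pool grows/shrinks only on node 1 for this write */
        f = fopen("/proc/sys/vm/nr_hugepages_mempolicy", "w");
        if (!f) {
            perror("nr_hugepages_mempolicy");
            return 1;
        }
        fprintf(f, "%d\n", 128);
        return fclose(f);
    }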
@@ -1903,6 +2240,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1903 | + (vma->vm_pgoff >> PAGE_SHIFT); | 2240 | + (vma->vm_pgoff >> PAGE_SHIFT); |
1904 | mapping = (struct address_space *)page_private(page); | 2241 | mapping = (struct address_space *)page_private(page); |
1905 | 2242 | ||
2243 | /* | ||
2244 | * Take the mapping lock for the duration of the table walk. As | ||
2245 | * this mapping should be shared between all the VMAs, | ||
2246 | * __unmap_hugepage_range() is called as the lock is already held | ||
2247 | */ | ||
2248 | spin_lock(&mapping->i_mmap_lock); | ||
1906 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 2249 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
1907 | /* Do not unmap the current VMA */ | 2250 | /* Do not unmap the current VMA */ |
1908 | if (iter_vma == vma) | 2251 | if (iter_vma == vma) |
@@ -1916,10 +2259,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1916 | * from the time of fork. This would look like data corruption | 2259 | * from the time of fork. This would look like data corruption |
1917 | */ | 2260 | */ |
1918 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) | 2261 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) |
1919 | unmap_hugepage_range(iter_vma, | 2262 | __unmap_hugepage_range(iter_vma, |
1920 | address, address + huge_page_size(h), | 2263 | address, address + huge_page_size(h), |
1921 | page); | 2264 | page); |
1922 | } | 2265 | } |
2266 | spin_unlock(&mapping->i_mmap_lock); | ||
1923 | 2267 | ||
1924 | return 1; | 2268 | return 1; |
1925 | } | 2269 | } |
@@ -1959,6 +2303,9 @@ retry_avoidcopy: | |||
1959 | outside_reserve = 1; | 2303 | outside_reserve = 1; |
1960 | 2304 | ||
1961 | page_cache_get(old_page); | 2305 | page_cache_get(old_page); |
2306 | |||
2307 | /* Drop page_table_lock as buddy allocator may be called */ | ||
2308 | spin_unlock(&mm->page_table_lock); | ||
1962 | new_page = alloc_huge_page(vma, address, outside_reserve); | 2309 | new_page = alloc_huge_page(vma, address, outside_reserve); |
1963 | 2310 | ||
1964 | if (IS_ERR(new_page)) { | 2311 | if (IS_ERR(new_page)) { |
@@ -1976,19 +2323,25 @@ retry_avoidcopy: | |||
1976 | if (unmap_ref_private(mm, vma, old_page, address)) { | 2323 | if (unmap_ref_private(mm, vma, old_page, address)) { |
1977 | BUG_ON(page_count(old_page) != 1); | 2324 | BUG_ON(page_count(old_page) != 1); |
1978 | BUG_ON(huge_pte_none(pte)); | 2325 | BUG_ON(huge_pte_none(pte)); |
2326 | spin_lock(&mm->page_table_lock); | ||
1979 | goto retry_avoidcopy; | 2327 | goto retry_avoidcopy; |
1980 | } | 2328 | } |
1981 | WARN_ON_ONCE(1); | 2329 | WARN_ON_ONCE(1); |
1982 | } | 2330 | } |
1983 | 2331 | ||
2332 | /* Caller expects lock to be held */ | ||
2333 | spin_lock(&mm->page_table_lock); | ||
1984 | return -PTR_ERR(new_page); | 2334 | return -PTR_ERR(new_page); |
1985 | } | 2335 | } |
1986 | 2336 | ||
1987 | spin_unlock(&mm->page_table_lock); | ||
1988 | copy_huge_page(new_page, old_page, address, vma); | 2337 | copy_huge_page(new_page, old_page, address, vma); |
1989 | __SetPageUptodate(new_page); | 2338 | __SetPageUptodate(new_page); |
1990 | spin_lock(&mm->page_table_lock); | ||
1991 | 2339 | ||
2340 | /* | ||
2341 | * Retake the page_table_lock to check for racing updates | ||
2342 | * before the page tables are altered | ||
2343 | */ | ||
2344 | spin_lock(&mm->page_table_lock); | ||
1992 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2345 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
1993 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 2346 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
1994 | /* Break COW */ | 2347 | /* Break COW */ |