Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- mm/hugetlb.c | 551
1 file changed, 452 insertions, 99 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5d7601b02874..65f38c218207 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,6 +24,7 @@
24#include <asm/io.h> 24#include <asm/io.h>
25 25
26#include <linux/hugetlb.h> 26#include <linux/hugetlb.h>
27#include <linux/node.h>
27#include "internal.h" 28#include "internal.h"
28 29
29const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 30const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
@@ -622,42 +623,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
622} 623}
623 624
624/* 625/*
625 * Use a helper variable to find the next node and then 626 * common helper functions for hstate_next_node_to_{alloc|free}.
626 * copy it back to next_nid_to_alloc afterwards: 627 * We may have allocated or freed a huge page based on a different
627 * otherwise there's a window in which a racer might 628 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
628 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. 629 * be outside of *nodes_allowed. Ensure that we use an allowed
629 * But we don't need to use a spin_lock here: it really 630 * node for alloc or free.
630 * doesn't matter if occasionally a racer chooses the
631 * same nid as we do. Move nid forward in the mask even
632 * if we just successfully allocated a hugepage so that
633 * the next caller gets hugepages on the next node.
634 */ 631 */
635static int hstate_next_node_to_alloc(struct hstate *h) 632static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
636{ 633{
637 int next_nid; 634 nid = next_node(nid, *nodes_allowed);
638 next_nid = next_node(h->next_nid_to_alloc, node_online_map); 635 if (nid == MAX_NUMNODES)
639 if (next_nid == MAX_NUMNODES) 636 nid = first_node(*nodes_allowed);
640 next_nid = first_node(node_online_map); 637 VM_BUG_ON(nid >= MAX_NUMNODES);
641 h->next_nid_to_alloc = next_nid; 638
642 return next_nid; 639 return nid;
640}
641
642static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
643{
644 if (!node_isset(nid, *nodes_allowed))
645 nid = next_node_allowed(nid, nodes_allowed);
646 return nid;
647}
648
649/*
650 * returns the previously saved node ["this node"] from which to
651 * allocate a persistent huge page for the pool and advance the
652 * next node from which to allocate, handling wrap at end of node
653 * mask.
654 */
655static int hstate_next_node_to_alloc(struct hstate *h,
656 nodemask_t *nodes_allowed)
657{
658 int nid;
659
660 VM_BUG_ON(!nodes_allowed);
661
662 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
663 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
664
665 return nid;
643} 666}
644 667
645static int alloc_fresh_huge_page(struct hstate *h) 668static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
646{ 669{
647 struct page *page; 670 struct page *page;
648 int start_nid; 671 int start_nid;
649 int next_nid; 672 int next_nid;
650 int ret = 0; 673 int ret = 0;
651 674
652 start_nid = h->next_nid_to_alloc; 675 start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
653 next_nid = start_nid; 676 next_nid = start_nid;
654 677
655 do { 678 do {
656 page = alloc_fresh_huge_page_node(h, next_nid); 679 page = alloc_fresh_huge_page_node(h, next_nid);
657 if (page) 680 if (page) {
658 ret = 1; 681 ret = 1;
659 next_nid = hstate_next_node_to_alloc(h); 682 break;
660 } while (!page && next_nid != start_nid); 683 }
684 next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
685 } while (next_nid != start_nid);
661 686
662 if (ret) 687 if (ret)
663 count_vm_event(HTLB_BUDDY_PGALLOC); 688 count_vm_event(HTLB_BUDDY_PGALLOC);
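
The hunk above replaces the open-coded next_node()/first_node() wrap with two small helpers that keep the round-robin cursor inside an arbitrary nodes_allowed mask. The same "return this node, advance the cursor" logic can be modeled outside the kernel as follows (a minimal standalone C sketch, not kernel API; the fixed 8-slot array stands in for nodemask_t and all names are illustrative):

	#include <stdbool.h>
	#include <stdio.h>

	#define MAX_NODES 8

	/* Mirror of next_node_allowed(): step to the next allowed slot, wrapping. */
	static int next_allowed(int nid, const bool allowed[MAX_NODES])
	{
		do {
			nid = (nid + 1) % MAX_NODES;
		} while (!allowed[nid]);
		return nid;
	}

	/* Mirror of hstate_next_node_to_alloc(): return the saved "this node"
	 * (coerced into the allowed set) and advance the cursor for next time. */
	static int next_node_to_alloc(int *cursor, const bool allowed[MAX_NODES])
	{
		int nid = *cursor;

		if (!allowed[nid])
			nid = next_allowed(nid, allowed);
		*cursor = next_allowed(nid, allowed);
		return nid;
	}

	int main(void)
	{
		bool allowed[MAX_NODES] = { [1] = true, [3] = true, [4] = true };
		int cursor = 0;

		for (int i = 0; i < 6; i++)
			printf("%d ", next_node_to_alloc(&cursor, allowed));
		printf("\n");	/* 1 3 4 1 3 4 - allocations interleave over nodes 1, 3, 4 */
		return 0;
	}
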
@@ -668,17 +693,21 @@ static int alloc_fresh_huge_page(struct hstate *h)
668} 693}
669 694
670/* 695/*
671 * helper for free_pool_huge_page() - find next node 696 * helper for free_pool_huge_page() - return the previously saved
672 * from which to free a huge page 697 * node ["this node"] from which to free a huge page. Advance the
698 * next node id whether or not we find a free huge page to free so
699 * that the next attempt to free addresses the next node.
673 */ 700 */
674static int hstate_next_node_to_free(struct hstate *h) 701static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
675{ 702{
676 int next_nid; 703 int nid;
677 next_nid = next_node(h->next_nid_to_free, node_online_map); 704
678 if (next_nid == MAX_NUMNODES) 705 VM_BUG_ON(!nodes_allowed);
679 next_nid = first_node(node_online_map); 706
680 h->next_nid_to_free = next_nid; 707 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
681 return next_nid; 708 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
709
710 return nid;
682} 711}
683 712
684/* 713/*
@@ -687,13 +716,14 @@ static int hstate_next_node_to_free(struct hstate *h)
687 * balanced over allowed nodes. 716 * balanced over allowed nodes.
688 * Called with hugetlb_lock locked. 717 * Called with hugetlb_lock locked.
689 */ 718 */
690static int free_pool_huge_page(struct hstate *h, bool acct_surplus) 719static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
720 bool acct_surplus)
691{ 721{
692 int start_nid; 722 int start_nid;
693 int next_nid; 723 int next_nid;
694 int ret = 0; 724 int ret = 0;
695 725
696 start_nid = h->next_nid_to_free; 726 start_nid = hstate_next_node_to_free(h, nodes_allowed);
697 next_nid = start_nid; 727 next_nid = start_nid;
698 728
699 do { 729 do {
@@ -715,9 +745,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
715 } 745 }
716 update_and_free_page(h, page); 746 update_and_free_page(h, page);
717 ret = 1; 747 ret = 1;
748 break;
718 } 749 }
719 next_nid = hstate_next_node_to_free(h); 750 next_nid = hstate_next_node_to_free(h, nodes_allowed);
720 } while (!ret && next_nid != start_nid); 751 } while (next_nid != start_nid);
721 752
722 return ret; 753 return ret;
723} 754}
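
With those helpers in place, alloc_fresh_huge_page() and free_pool_huge_page() share the same loop shape: fetch a starting node, try each allowed node at most once, break on the first success, and stop when the cursor comes back around. A hedged sketch of that iteration, reusing the illustrative next_node_to_alloc() from the previous example (try_one_node() stands in for the per-node alloc/free attempt; this compiles in the same translation unit as the sketch above):

	/* Walk the allowed set at most once, starting at the saved cursor.
	 * Returns 1 on the first node where the attempt succeeds, 0 if every
	 * allowed node was tried and failed. */
	static int walk_nodes_once(int *cursor, const bool allowed[MAX_NODES],
				   int (*try_one_node)(int nid))
	{
		int start_nid = next_node_to_alloc(cursor, allowed);
		int next_nid = start_nid;
		int ret = 0;

		do {
			if (try_one_node(next_nid)) {
				ret = 1;
				break;
			}
			next_nid = next_node_to_alloc(cursor, allowed);
		} while (next_nid != start_nid);

		return ret;
	}
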
@@ -911,14 +942,14 @@ static void return_unused_surplus_pages(struct hstate *h,
911 942
912 /* 943 /*
913 * We want to release as many surplus pages as possible, spread 944 * We want to release as many surplus pages as possible, spread
914 * evenly across all nodes. Iterate across all nodes until we 945 * evenly across all nodes with memory. Iterate across these nodes
915 * can no longer free unreserved surplus pages. This occurs when 946 * until we can no longer free unreserved surplus pages. This occurs
916 * the nodes with surplus pages have no free pages. 947 * when the nodes with surplus pages have no free pages.
917 * free_pool_huge_page() will balance the the frees across the 948 * free_pool_huge_page() will balance the the freed pages across the
918 * on-line nodes for us and will handle the hstate accounting. 949 * on-line nodes with memory and will handle the hstate accounting.
919 */ 950 */
920 while (nr_pages--) { 951 while (nr_pages--) {
921 if (!free_pool_huge_page(h, 1)) 952 if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
922 break; 953 break;
923 } 954 }
924} 955}
@@ -1022,16 +1053,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1022int __weak alloc_bootmem_huge_page(struct hstate *h) 1053int __weak alloc_bootmem_huge_page(struct hstate *h)
1023{ 1054{
1024 struct huge_bootmem_page *m; 1055 struct huge_bootmem_page *m;
1025 int nr_nodes = nodes_weight(node_online_map); 1056 int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
1026 1057
1027 while (nr_nodes) { 1058 while (nr_nodes) {
1028 void *addr; 1059 void *addr;
1029 1060
1030 addr = __alloc_bootmem_node_nopanic( 1061 addr = __alloc_bootmem_node_nopanic(
1031 NODE_DATA(h->next_nid_to_alloc), 1062 NODE_DATA(hstate_next_node_to_alloc(h,
1063 &node_states[N_HIGH_MEMORY])),
1032 huge_page_size(h), huge_page_size(h), 0); 1064 huge_page_size(h), huge_page_size(h), 0);
1033 1065
1034 hstate_next_node_to_alloc(h);
1035 if (addr) { 1066 if (addr) {
1036 /* 1067 /*
1037 * Use the beginning of the huge page to store the 1068 * Use the beginning of the huge page to store the
@@ -1084,7 +1115,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1084 if (h->order >= MAX_ORDER) { 1115 if (h->order >= MAX_ORDER) {
1085 if (!alloc_bootmem_huge_page(h)) 1116 if (!alloc_bootmem_huge_page(h))
1086 break; 1117 break;
1087 } else if (!alloc_fresh_huge_page(h)) 1118 } else if (!alloc_fresh_huge_page(h,
1119 &node_states[N_HIGH_MEMORY]))
1088 break; 1120 break;
1089 } 1121 }
1090 h->max_huge_pages = i; 1122 h->max_huge_pages = i;
@@ -1126,14 +1158,15 @@ static void __init report_hugepages(void)
1126} 1158}
1127 1159
1128#ifdef CONFIG_HIGHMEM 1160#ifdef CONFIG_HIGHMEM
1129static void try_to_free_low(struct hstate *h, unsigned long count) 1161static void try_to_free_low(struct hstate *h, unsigned long count,
1162 nodemask_t *nodes_allowed)
1130{ 1163{
1131 int i; 1164 int i;
1132 1165
1133 if (h->order >= MAX_ORDER) 1166 if (h->order >= MAX_ORDER)
1134 return; 1167 return;
1135 1168
1136 for (i = 0; i < MAX_NUMNODES; ++i) { 1169 for_each_node_mask(i, *nodes_allowed) {
1137 struct page *page, *next; 1170 struct page *page, *next;
1138 struct list_head *freel = &h->hugepage_freelists[i]; 1171 struct list_head *freel = &h->hugepage_freelists[i];
1139 list_for_each_entry_safe(page, next, freel, lru) { 1172 list_for_each_entry_safe(page, next, freel, lru) {
@@ -1149,7 +1182,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count)
1149 } 1182 }
1150} 1183}
1151#else 1184#else
1152static inline void try_to_free_low(struct hstate *h, unsigned long count) 1185static inline void try_to_free_low(struct hstate *h, unsigned long count,
1186 nodemask_t *nodes_allowed)
1153{ 1187{
1154} 1188}
1155#endif 1189#endif
@@ -1159,7 +1193,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
1159 * balanced by operating on them in a round-robin fashion. 1193 * balanced by operating on them in a round-robin fashion.
1160 * Returns 1 if an adjustment was made. 1194 * Returns 1 if an adjustment was made.
1161 */ 1195 */
1162static int adjust_pool_surplus(struct hstate *h, int delta) 1196static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
1197 int delta)
1163{ 1198{
1164 int start_nid, next_nid; 1199 int start_nid, next_nid;
1165 int ret = 0; 1200 int ret = 0;
@@ -1167,29 +1202,33 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
1167 VM_BUG_ON(delta != -1 && delta != 1); 1202 VM_BUG_ON(delta != -1 && delta != 1);
1168 1203
1169 if (delta < 0) 1204 if (delta < 0)
1170 start_nid = h->next_nid_to_alloc; 1205 start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
1171 else 1206 else
1172 start_nid = h->next_nid_to_free; 1207 start_nid = hstate_next_node_to_free(h, nodes_allowed);
1173 next_nid = start_nid; 1208 next_nid = start_nid;
1174 1209
1175 do { 1210 do {
1176 int nid = next_nid; 1211 int nid = next_nid;
1177 if (delta < 0) { 1212 if (delta < 0) {
1178 next_nid = hstate_next_node_to_alloc(h);
1179 /* 1213 /*
1180 * To shrink on this node, there must be a surplus page 1214 * To shrink on this node, there must be a surplus page
1181 */ 1215 */
1182 if (!h->surplus_huge_pages_node[nid]) 1216 if (!h->surplus_huge_pages_node[nid]) {
1217 next_nid = hstate_next_node_to_alloc(h,
1218 nodes_allowed);
1183 continue; 1219 continue;
1220 }
1184 } 1221 }
1185 if (delta > 0) { 1222 if (delta > 0) {
1186 next_nid = hstate_next_node_to_free(h);
1187 /* 1223 /*
1188 * Surplus cannot exceed the total number of pages 1224 * Surplus cannot exceed the total number of pages
1189 */ 1225 */
1190 if (h->surplus_huge_pages_node[nid] >= 1226 if (h->surplus_huge_pages_node[nid] >=
1191 h->nr_huge_pages_node[nid]) 1227 h->nr_huge_pages_node[nid]) {
1228 next_nid = hstate_next_node_to_free(h,
1229 nodes_allowed);
1192 continue; 1230 continue;
1231 }
1193 } 1232 }
1194 1233
1195 h->surplus_huge_pages += delta; 1234 h->surplus_huge_pages += delta;
@@ -1202,7 +1241,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
1202} 1241}
1203 1242
1204#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 1243#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
1205static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) 1244static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
1245 nodemask_t *nodes_allowed)
1206{ 1246{
1207 unsigned long min_count, ret; 1247 unsigned long min_count, ret;
1208 1248
@@ -1222,7 +1262,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1222 */ 1262 */
1223 spin_lock(&hugetlb_lock); 1263 spin_lock(&hugetlb_lock);
1224 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 1264 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
1225 if (!adjust_pool_surplus(h, -1)) 1265 if (!adjust_pool_surplus(h, nodes_allowed, -1))
1226 break; 1266 break;
1227 } 1267 }
1228 1268
@@ -1233,11 +1273,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1233 * and reducing the surplus. 1273 * and reducing the surplus.
1234 */ 1274 */
1235 spin_unlock(&hugetlb_lock); 1275 spin_unlock(&hugetlb_lock);
1236 ret = alloc_fresh_huge_page(h); 1276 ret = alloc_fresh_huge_page(h, nodes_allowed);
1237 spin_lock(&hugetlb_lock); 1277 spin_lock(&hugetlb_lock);
1238 if (!ret) 1278 if (!ret)
1239 goto out; 1279 goto out;
1240 1280
1281 /* Bail for signals. Probably ctrl-c from user */
1282 if (signal_pending(current))
1283 goto out;
1241 } 1284 }
1242 1285
1243 /* 1286 /*
@@ -1257,13 +1300,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1257 */ 1300 */
1258 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; 1301 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
1259 min_count = max(count, min_count); 1302 min_count = max(count, min_count);
1260 try_to_free_low(h, min_count); 1303 try_to_free_low(h, min_count, nodes_allowed);
1261 while (min_count < persistent_huge_pages(h)) { 1304 while (min_count < persistent_huge_pages(h)) {
1262 if (!free_pool_huge_page(h, 0)) 1305 if (!free_pool_huge_page(h, nodes_allowed, 0))
1263 break; 1306 break;
1264 } 1307 }
1265 while (count < persistent_huge_pages(h)) { 1308 while (count < persistent_huge_pages(h)) {
1266 if (!adjust_pool_surplus(h, 1)) 1309 if (!adjust_pool_surplus(h, nodes_allowed, 1))
1267 break; 1310 break;
1268 } 1311 }
1269out: 1312out:
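
On the shrink side, set_max_huge_pages() keeps min_count = resv_huge_pages + nr_huge_pages - free_huge_pages pages alive so that in-use pages and outstanding reservations are never freed, then clamps that up to the requested count. A worked example of the arithmetic (standalone sketch; the numbers are invented):

	#include <stdio.h>

	static unsigned long max_ul(unsigned long a, unsigned long b)
	{
		return a > b ? a : b;
	}

	int main(void)
	{
		unsigned long nr_huge_pages = 100;	/* pages in the pool       */
		unsigned long free_huge_pages = 30;	/* of which 30 are idle    */
		unsigned long resv_huge_pages = 10;	/* promised to mappings    */
		unsigned long count = 50;		/* new requested pool size */

		/* 70 pages are in use, plus 10 reserved: cannot go below 80. */
		unsigned long min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
		min_count = max_ul(count, min_count);

		printf("target after shrink: %lu\n", min_count);	/* prints 80 */
		return 0;
	}
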
@@ -1282,43 +1325,117 @@ out:
1282static struct kobject *hugepages_kobj; 1325static struct kobject *hugepages_kobj;
1283static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 1326static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1284 1327
1285static struct hstate *kobj_to_hstate(struct kobject *kobj) 1328static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
1329
1330static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
1286{ 1331{
1287 int i; 1332 int i;
1333
1288 for (i = 0; i < HUGE_MAX_HSTATE; i++) 1334 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1289 if (hstate_kobjs[i] == kobj) 1335 if (hstate_kobjs[i] == kobj) {
1336 if (nidp)
1337 *nidp = NUMA_NO_NODE;
1290 return &hstates[i]; 1338 return &hstates[i];
1291 BUG(); 1339 }
1292 return NULL; 1340
1341 return kobj_to_node_hstate(kobj, nidp);
1293} 1342}
1294 1343
1295static ssize_t nr_hugepages_show(struct kobject *kobj, 1344static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1296 struct kobj_attribute *attr, char *buf) 1345 struct kobj_attribute *attr, char *buf)
1297{ 1346{
1298 struct hstate *h = kobj_to_hstate(kobj); 1347 struct hstate *h;
1299 return sprintf(buf, "%lu\n", h->nr_huge_pages); 1348 unsigned long nr_huge_pages;
1349 int nid;
1350
1351 h = kobj_to_hstate(kobj, &nid);
1352 if (nid == NUMA_NO_NODE)
1353 nr_huge_pages = h->nr_huge_pages;
1354 else
1355 nr_huge_pages = h->nr_huge_pages_node[nid];
1356
1357 return sprintf(buf, "%lu\n", nr_huge_pages);
1300} 1358}
1301static ssize_t nr_hugepages_store(struct kobject *kobj, 1359static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1302 struct kobj_attribute *attr, const char *buf, size_t count) 1360 struct kobject *kobj, struct kobj_attribute *attr,
1361 const char *buf, size_t len)
1303{ 1362{
1304 int err; 1363 int err;
1305 unsigned long input; 1364 int nid;
1306 struct hstate *h = kobj_to_hstate(kobj); 1365 unsigned long count;
1366 struct hstate *h;
1367 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
1307 1368
1308 err = strict_strtoul(buf, 10, &input); 1369 err = strict_strtoul(buf, 10, &count);
1309 if (err) 1370 if (err)
1310 return 0; 1371 return 0;
1311 1372
1312 h->max_huge_pages = set_max_huge_pages(h, input); 1373 h = kobj_to_hstate(kobj, &nid);
1374 if (nid == NUMA_NO_NODE) {
1375 /*
1376 * global hstate attribute
1377 */
1378 if (!(obey_mempolicy &&
1379 init_nodemask_of_mempolicy(nodes_allowed))) {
1380 NODEMASK_FREE(nodes_allowed);
1381 nodes_allowed = &node_states[N_HIGH_MEMORY];
1382 }
1383 } else if (nodes_allowed) {
1384 /*
1385 * per node hstate attribute: adjust count to global,
1386 * but restrict alloc/free to the specified node.
1387 */
1388 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
1389 init_nodemask_of_node(nodes_allowed, nid);
1390 } else
1391 nodes_allowed = &node_states[N_HIGH_MEMORY];
1392
1393 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
1313 1394
1314 return count; 1395 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1396 NODEMASK_FREE(nodes_allowed);
1397
1398 return len;
1399}
1400
1401static ssize_t nr_hugepages_show(struct kobject *kobj,
1402 struct kobj_attribute *attr, char *buf)
1403{
1404 return nr_hugepages_show_common(kobj, attr, buf);
1405}
1406
1407static ssize_t nr_hugepages_store(struct kobject *kobj,
1408 struct kobj_attribute *attr, const char *buf, size_t len)
1409{
1410 return nr_hugepages_store_common(false, kobj, attr, buf, len);
1315} 1411}
1316HSTATE_ATTR(nr_hugepages); 1412HSTATE_ATTR(nr_hugepages);
1317 1413
1414#ifdef CONFIG_NUMA
1415
1416/*
1417 * hstate attribute for optionally mempolicy-based constraint on persistent
1418 * huge page alloc/free.
1419 */
1420static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
1421 struct kobj_attribute *attr, char *buf)
1422{
1423 return nr_hugepages_show_common(kobj, attr, buf);
1424}
1425
1426static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
1427 struct kobj_attribute *attr, const char *buf, size_t len)
1428{
1429 return nr_hugepages_store_common(true, kobj, attr, buf, len);
1430}
1431HSTATE_ATTR(nr_hugepages_mempolicy);
1432#endif
1433
1434
1318static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 1435static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1319 struct kobj_attribute *attr, char *buf) 1436 struct kobj_attribute *attr, char *buf)
1320{ 1437{
1321 struct hstate *h = kobj_to_hstate(kobj); 1438 struct hstate *h = kobj_to_hstate(kobj, NULL);
1322 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 1439 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1323} 1440}
1324static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 1441static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
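
nr_hugepages_store_common() has one subtle conversion: a per-node attribute receives a per-node target, but set_max_huge_pages() works on the global count, so the written value is rebased by adding the pages currently sitting on all other nodes while nodes_allowed restricts the actual alloc/free work to the one node. A small standalone sketch of that rebasing (variable names follow the patch; the figures are invented):

	#include <stdio.h>

	int main(void)
	{
		unsigned long nr_huge_pages = 120;	/* global pool size */
		unsigned long nr_huge_pages_node1 = 20;	/* pages on node 1  */
		unsigned long count = 50;	/* echo 50 > node1/.../nr_hugepages */

		/* Rebase the per-node request to a global target: keep the other
		 * nodes' 100 pages and ask for 50 on node 1, i.e. 150 overall. */
		count += nr_huge_pages - nr_huge_pages_node1;

		printf("global target passed to set_max_huge_pages(): %lu\n", count);
		/* prints 150 */
		return 0;
	}
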
@@ -1326,7 +1443,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1326{ 1443{
1327 int err; 1444 int err;
1328 unsigned long input; 1445 unsigned long input;
1329 struct hstate *h = kobj_to_hstate(kobj); 1446 struct hstate *h = kobj_to_hstate(kobj, NULL);
1330 1447
1331 err = strict_strtoul(buf, 10, &input); 1448 err = strict_strtoul(buf, 10, &input);
1332 if (err) 1449 if (err)
@@ -1343,15 +1460,24 @@ HSTATE_ATTR(nr_overcommit_hugepages);
1343static ssize_t free_hugepages_show(struct kobject *kobj, 1460static ssize_t free_hugepages_show(struct kobject *kobj,
1344 struct kobj_attribute *attr, char *buf) 1461 struct kobj_attribute *attr, char *buf)
1345{ 1462{
1346 struct hstate *h = kobj_to_hstate(kobj); 1463 struct hstate *h;
1347 return sprintf(buf, "%lu\n", h->free_huge_pages); 1464 unsigned long free_huge_pages;
1465 int nid;
1466
1467 h = kobj_to_hstate(kobj, &nid);
1468 if (nid == NUMA_NO_NODE)
1469 free_huge_pages = h->free_huge_pages;
1470 else
1471 free_huge_pages = h->free_huge_pages_node[nid];
1472
1473 return sprintf(buf, "%lu\n", free_huge_pages);
1348} 1474}
1349HSTATE_ATTR_RO(free_hugepages); 1475HSTATE_ATTR_RO(free_hugepages);
1350 1476
1351static ssize_t resv_hugepages_show(struct kobject *kobj, 1477static ssize_t resv_hugepages_show(struct kobject *kobj,
1352 struct kobj_attribute *attr, char *buf) 1478 struct kobj_attribute *attr, char *buf)
1353{ 1479{
1354 struct hstate *h = kobj_to_hstate(kobj); 1480 struct hstate *h = kobj_to_hstate(kobj, NULL);
1355 return sprintf(buf, "%lu\n", h->resv_huge_pages); 1481 return sprintf(buf, "%lu\n", h->resv_huge_pages);
1356} 1482}
1357HSTATE_ATTR_RO(resv_hugepages); 1483HSTATE_ATTR_RO(resv_hugepages);
@@ -1359,8 +1485,17 @@ HSTATE_ATTR_RO(resv_hugepages);
1359static ssize_t surplus_hugepages_show(struct kobject *kobj, 1485static ssize_t surplus_hugepages_show(struct kobject *kobj,
1360 struct kobj_attribute *attr, char *buf) 1486 struct kobj_attribute *attr, char *buf)
1361{ 1487{
1362 struct hstate *h = kobj_to_hstate(kobj); 1488 struct hstate *h;
1363 return sprintf(buf, "%lu\n", h->surplus_huge_pages); 1489 unsigned long surplus_huge_pages;
1490 int nid;
1491
1492 h = kobj_to_hstate(kobj, &nid);
1493 if (nid == NUMA_NO_NODE)
1494 surplus_huge_pages = h->surplus_huge_pages;
1495 else
1496 surplus_huge_pages = h->surplus_huge_pages_node[nid];
1497
1498 return sprintf(buf, "%lu\n", surplus_huge_pages);
1364} 1499}
1365HSTATE_ATTR_RO(surplus_hugepages); 1500HSTATE_ATTR_RO(surplus_hugepages);
1366 1501
@@ -1370,6 +1505,9 @@ static struct attribute *hstate_attrs[] = {
1370 &free_hugepages_attr.attr, 1505 &free_hugepages_attr.attr,
1371 &resv_hugepages_attr.attr, 1506 &resv_hugepages_attr.attr,
1372 &surplus_hugepages_attr.attr, 1507 &surplus_hugepages_attr.attr,
1508#ifdef CONFIG_NUMA
1509 &nr_hugepages_mempolicy_attr.attr,
1510#endif
1373 NULL, 1511 NULL,
1374}; 1512};
1375 1513
@@ -1377,19 +1515,21 @@ static struct attribute_group hstate_attr_group = {
1377 .attrs = hstate_attrs, 1515 .attrs = hstate_attrs,
1378}; 1516};
1379 1517
1380static int __init hugetlb_sysfs_add_hstate(struct hstate *h) 1518static int __init hugetlb_sysfs_add_hstate(struct hstate *h,
1519 struct kobject *parent,
1520 struct kobject **hstate_kobjs,
1521 struct attribute_group *hstate_attr_group)
1381{ 1522{
1382 int retval; 1523 int retval;
1524 int hi = h - hstates;
1383 1525
1384 hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, 1526 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
1385 hugepages_kobj); 1527 if (!hstate_kobjs[hi])
1386 if (!hstate_kobjs[h - hstates])
1387 return -ENOMEM; 1528 return -ENOMEM;
1388 1529
1389 retval = sysfs_create_group(hstate_kobjs[h - hstates], 1530 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
1390 &hstate_attr_group);
1391 if (retval) 1531 if (retval)
1392 kobject_put(hstate_kobjs[h - hstates]); 1532 kobject_put(hstate_kobjs[hi]);
1393 1533
1394 return retval; 1534 return retval;
1395} 1535}
@@ -1404,17 +1544,184 @@ static void __init hugetlb_sysfs_init(void)
1404 return; 1544 return;
1405 1545
1406 for_each_hstate(h) { 1546 for_each_hstate(h) {
1407 err = hugetlb_sysfs_add_hstate(h); 1547 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
1548 hstate_kobjs, &hstate_attr_group);
1408 if (err) 1549 if (err)
1409 printk(KERN_ERR "Hugetlb: Unable to add hstate %s", 1550 printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
1410 h->name); 1551 h->name);
1411 } 1552 }
1412} 1553}
1413 1554
1555#ifdef CONFIG_NUMA
1556
1557/*
1558 * node_hstate/s - associate per node hstate attributes, via their kobjects,
1559 * with node sysdevs in node_devices[] using a parallel array. The array
1560 * index of a node sysdev or _hstate == node id.
1561 * This is here to avoid any static dependency of the node sysdev driver, in
1562 * the base kernel, on the hugetlb module.
1563 */
1564struct node_hstate {
1565 struct kobject *hugepages_kobj;
1566 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1567};
1568struct node_hstate node_hstates[MAX_NUMNODES];
1569
1570/*
1571 * A subset of global hstate attributes for node sysdevs
1572 */
1573static struct attribute *per_node_hstate_attrs[] = {
1574 &nr_hugepages_attr.attr,
1575 &free_hugepages_attr.attr,
1576 &surplus_hugepages_attr.attr,
1577 NULL,
1578};
1579
1580static struct attribute_group per_node_hstate_attr_group = {
1581 .attrs = per_node_hstate_attrs,
1582};
1583
1584/*
1585 * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj.
1586 * Returns node id via non-NULL nidp.
1587 */
1588static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1589{
1590 int nid;
1591
1592 for (nid = 0; nid < nr_node_ids; nid++) {
1593 struct node_hstate *nhs = &node_hstates[nid];
1594 int i;
1595 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1596 if (nhs->hstate_kobjs[i] == kobj) {
1597 if (nidp)
1598 *nidp = nid;
1599 return &hstates[i];
1600 }
1601 }
1602
1603 BUG();
1604 return NULL;
1605}
1606
1607/*
1608 * Unregister hstate attributes from a single node sysdev.
1609 * No-op if no hstate attributes attached.
1610 */
1611void hugetlb_unregister_node(struct node *node)
1612{
1613 struct hstate *h;
1614 struct node_hstate *nhs = &node_hstates[node->sysdev.id];
1615
1616 if (!nhs->hugepages_kobj)
1617 return; /* no hstate attributes */
1618
1619 for_each_hstate(h)
1620 if (nhs->hstate_kobjs[h - hstates]) {
1621 kobject_put(nhs->hstate_kobjs[h - hstates]);
1622 nhs->hstate_kobjs[h - hstates] = NULL;
1623 }
1624
1625 kobject_put(nhs->hugepages_kobj);
1626 nhs->hugepages_kobj = NULL;
1627}
1628
1629/*
1630 * hugetlb module exit: unregister hstate attributes from node sysdevs
1631 * that have them.
1632 */
1633static void hugetlb_unregister_all_nodes(void)
1634{
1635 int nid;
1636
1637 /*
1638 * disable node sysdev registrations.
1639 */
1640 register_hugetlbfs_with_node(NULL, NULL);
1641
1642 /*
1643 * remove hstate attributes from any nodes that have them.
1644 */
1645 for (nid = 0; nid < nr_node_ids; nid++)
1646 hugetlb_unregister_node(&node_devices[nid]);
1647}
1648
1649/*
1650 * Register hstate attributes for a single node sysdev.
1651 * No-op if attributes already registered.
1652 */
1653void hugetlb_register_node(struct node *node)
1654{
1655 struct hstate *h;
1656 struct node_hstate *nhs = &node_hstates[node->sysdev.id];
1657 int err;
1658
1659 if (nhs->hugepages_kobj)
1660 return; /* already allocated */
1661
1662 nhs->hugepages_kobj = kobject_create_and_add("hugepages",
1663 &node->sysdev.kobj);
1664 if (!nhs->hugepages_kobj)
1665 return;
1666
1667 for_each_hstate(h) {
1668 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
1669 nhs->hstate_kobjs,
1670 &per_node_hstate_attr_group);
1671 if (err) {
1672 printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
1673 " for node %d\n",
1674 h->name, node->sysdev.id);
1675 hugetlb_unregister_node(node);
1676 break;
1677 }
1678 }
1679}
1680
1681/*
1682 * hugetlb init time: register hstate attributes for all registered node
1683 * sysdevs of nodes that have memory. All on-line nodes should have
1684 * registered their associated sysdev by this time.
1685 */
1686static void hugetlb_register_all_nodes(void)
1687{
1688 int nid;
1689
1690 for_each_node_state(nid, N_HIGH_MEMORY) {
1691 struct node *node = &node_devices[nid];
1692 if (node->sysdev.id == nid)
1693 hugetlb_register_node(node);
1694 }
1695
1696 /*
1697 * Let the node sysdev driver know we're here so it can
1698 * [un]register hstate attributes on node hotplug.
1699 */
1700 register_hugetlbfs_with_node(hugetlb_register_node,
1701 hugetlb_unregister_node);
1702}
1703#else /* !CONFIG_NUMA */
1704
1705static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1706{
1707 BUG();
1708 if (nidp)
1709 *nidp = -1;
1710 return NULL;
1711}
1712
1713static void hugetlb_unregister_all_nodes(void) { }
1714
1715static void hugetlb_register_all_nodes(void) { }
1716
1717#endif
1718
1414static void __exit hugetlb_exit(void) 1719static void __exit hugetlb_exit(void)
1415{ 1720{
1416 struct hstate *h; 1721 struct hstate *h;
1417 1722
1723 hugetlb_unregister_all_nodes();
1724
1418 for_each_hstate(h) { 1725 for_each_hstate(h) {
1419 kobject_put(hstate_kobjs[h - hstates]); 1726 kobject_put(hstate_kobjs[h - hstates]);
1420 } 1727 }
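
The per-node registration code keeps one struct node_hstate per node id and resolves a sysfs kobject back to its (hstate, nid) pair by scanning those parallel arrays. A hedged standalone model of the reverse lookup (plain pointers replace kobjects; sizes and names are illustrative, not kernel API):

	#include <stdio.h>

	#define MAX_NODES   4
	#define MAX_HSTATES 2

	/* One slot per (node, hstate); the stored pointer plays the kobject role. */
	static void *node_hstate_kobjs[MAX_NODES][MAX_HSTATES];

	/* Find which node and hstate index a kobject belongs to, or -1/-1. */
	static void kobj_to_node_hstate(void *kobj, int *nidp, int *hip)
	{
		for (int nid = 0; nid < MAX_NODES; nid++)
			for (int hi = 0; hi < MAX_HSTATES; hi++)
				if (node_hstate_kobjs[nid][hi] == kobj) {
					*nidp = nid;
					*hip = hi;
					return;
				}
		*nidp = *hip = -1;
	}

	int main(void)
	{
		static int token;			/* stand-in kobject      */
		int nid, hi;

		node_hstate_kobjs[2][1] = &token;	/* "register" on node 2  */
		kobj_to_node_hstate(&token, &nid, &hi);
		printf("nid=%d hstate=%d\n", nid, hi);	/* nid=2 hstate=1        */
		return 0;
	}
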
@@ -1449,6 +1756,8 @@ static int __init hugetlb_init(void)
1449 1756
1450 hugetlb_sysfs_init(); 1757 hugetlb_sysfs_init();
1451 1758
1759 hugetlb_register_all_nodes();
1760
1452 return 0; 1761 return 0;
1453} 1762}
1454module_init(hugetlb_init); 1763module_init(hugetlb_init);
@@ -1472,8 +1781,8 @@ void __init hugetlb_add_hstate(unsigned order)
1472 h->free_huge_pages = 0; 1781 h->free_huge_pages = 0;
1473 for (i = 0; i < MAX_NUMNODES; ++i) 1782 for (i = 0; i < MAX_NUMNODES; ++i)
1474 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1783 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1475 h->next_nid_to_alloc = first_node(node_online_map); 1784 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
1476 h->next_nid_to_free = first_node(node_online_map); 1785 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
1477 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1786 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1478 huge_page_size(h)/1024); 1787 huge_page_size(h)/1024);
1479 1788
@@ -1536,9 +1845,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
1536} 1845}
1537 1846
1538#ifdef CONFIG_SYSCTL 1847#ifdef CONFIG_SYSCTL
1539int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1848static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1540 void __user *buffer, 1849 struct ctl_table *table, int write,
1541 size_t *length, loff_t *ppos) 1850 void __user *buffer, size_t *length, loff_t *ppos)
1542{ 1851{
1543 struct hstate *h = &default_hstate; 1852 struct hstate *h = &default_hstate;
1544 unsigned long tmp; 1853 unsigned long tmp;
@@ -1550,12 +1859,40 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1550 table->maxlen = sizeof(unsigned long); 1859 table->maxlen = sizeof(unsigned long);
1551 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1860 proc_doulongvec_minmax(table, write, buffer, length, ppos);
1552 1861
1553 if (write) 1862 if (write) {
1554 h->max_huge_pages = set_max_huge_pages(h, tmp); 1863 NODEMASK_ALLOC(nodemask_t, nodes_allowed,
1864 GFP_KERNEL | __GFP_NORETRY);
1865 if (!(obey_mempolicy &&
1866 init_nodemask_of_mempolicy(nodes_allowed))) {
1867 NODEMASK_FREE(nodes_allowed);
1868 nodes_allowed = &node_states[N_HIGH_MEMORY];
1869 }
1870 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
1871
1872 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1873 NODEMASK_FREE(nodes_allowed);
1874 }
1555 1875
1556 return 0; 1876 return 0;
1557} 1877}
1558 1878
1879int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1880 void __user *buffer, size_t *length, loff_t *ppos)
1881{
1882
1883 return hugetlb_sysctl_handler_common(false, table, write,
1884 buffer, length, ppos);
1885}
1886
1887#ifdef CONFIG_NUMA
1888int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
1889 void __user *buffer, size_t *length, loff_t *ppos)
1890{
1891 return hugetlb_sysctl_handler_common(true, table, write,
1892 buffer, length, ppos);
1893}
1894#endif /* CONFIG_NUMA */
1895
1559int hugetlb_treat_movable_handler(struct ctl_table *table, int write, 1896int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1560 void __user *buffer, 1897 void __user *buffer,
1561 size_t *length, loff_t *ppos) 1898 size_t *length, loff_t *ppos)
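
Both the sysfs store path and this sysctl handler follow the same scratch-nodemask discipline: try to allocate a nodemask and fill it from the task's mempolicy; if either step fails (or mempolicy is not being obeyed), fall back to the static node_states[N_HIGH_MEMORY] mask, and free the scratch mask only when it is not that static fallback. A minimal sketch of that ownership pattern (generic C, not the kernel NODEMASK_ALLOC API):

	#include <stdio.h>
	#include <stdlib.h>

	typedef unsigned long mask_t;

	static mask_t default_mask = 0xff;	/* stand-in for node_states[N_HIGH_MEMORY] */

	/* Pretend to derive a mask from the caller's mempolicy; may "fail". */
	static int mask_of_mempolicy(mask_t *mask)
	{
		if (!mask)
			return 0;		/* allocation already failed */
		*mask = 0x05;			/* e.g. nodes 0 and 2        */
		return 1;
	}

	static void resize_pool(int obey_mempolicy)
	{
		mask_t *allowed = malloc(sizeof(*allowed));

		if (!(obey_mempolicy && mask_of_mempolicy(allowed))) {
			free(allowed);			/* safe even if NULL           */
			allowed = &default_mask;	/* fall back to the static mask */
		}

		printf("working over mask 0x%lx\n", *allowed);

		if (allowed != &default_mask)		/* only free what we allocated */
			free(allowed);
	}

	int main(void)
	{
		resize_pool(0);		/* 0xff: whole default mask       */
		resize_pool(1);		/* 0x05: mempolicy-restricted     */
		return 0;
	}
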
@@ -1903,6 +2240,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
1903 + (vma->vm_pgoff >> PAGE_SHIFT); 2240 + (vma->vm_pgoff >> PAGE_SHIFT);
1904 mapping = (struct address_space *)page_private(page); 2241 mapping = (struct address_space *)page_private(page);
1905 2242
2243 /*
2244 * Take the mapping lock for the duration of the table walk. As
2245 * this mapping should be shared between all the VMAs,
2246 * __unmap_hugepage_range() is called as the lock is already held
2247 */
2248 spin_lock(&mapping->i_mmap_lock);
1906 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 2249 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1907 /* Do not unmap the current VMA */ 2250 /* Do not unmap the current VMA */
1908 if (iter_vma == vma) 2251 if (iter_vma == vma)
@@ -1916,10 +2259,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
1916 * from the time of fork. This would look like data corruption 2259 * from the time of fork. This would look like data corruption
1917 */ 2260 */
1918 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 2261 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
1919 unmap_hugepage_range(iter_vma, 2262 __unmap_hugepage_range(iter_vma,
1920 address, address + huge_page_size(h), 2263 address, address + huge_page_size(h),
1921 page); 2264 page);
1922 } 2265 }
2266 spin_unlock(&mapping->i_mmap_lock);
1923 2267
1924 return 1; 2268 return 1;
1925} 2269}
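
The unmap_ref_private() change takes the shared mapping's i_mmap_lock once, walks every VMA in the prio tree under it, and therefore has to call __unmap_hugepage_range(), the variant that expects the lock to already be held, rather than the locking wrapper. The locked/unlocked-variant split can be sketched like this (illustrative userspace C, not the kernel API):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t i_mmap_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Variant that assumes the caller already holds i_mmap_lock. */
	static void __unmap_one(int vma_id)
	{
		printf("unmapping vma %d (lock held by caller)\n", vma_id);
	}

	/* Locking wrapper for callers that do not hold the lock themselves. */
	static void unmap_one(int vma_id)
	{
		pthread_mutex_lock(&i_mmap_lock);
		__unmap_one(vma_id);
		pthread_mutex_unlock(&i_mmap_lock);
	}

	/* Walk all sharers under one lock acquisition, as the patch does. */
	static void unmap_all_sharers(const int *vma_ids, int n, int skip)
	{
		pthread_mutex_lock(&i_mmap_lock);
		for (int i = 0; i < n; i++)
			if (vma_ids[i] != skip)
				__unmap_one(vma_ids[i]);  /* not unmap_one(): no double lock */
		pthread_mutex_unlock(&i_mmap_lock);
	}

	int main(void)
	{
		int vmas[] = { 1, 2, 3 };

		unmap_all_sharers(vmas, 3, 2);	/* skip the faulting VMA itself */
		unmap_one(4);
		return 0;
	}
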
@@ -1959,6 +2303,9 @@ retry_avoidcopy:
1959 outside_reserve = 1; 2303 outside_reserve = 1;
1960 2304
1961 page_cache_get(old_page); 2305 page_cache_get(old_page);
2306
2307 /* Drop page_table_lock as buddy allocator may be called */
2308 spin_unlock(&mm->page_table_lock);
1962 new_page = alloc_huge_page(vma, address, outside_reserve); 2309 new_page = alloc_huge_page(vma, address, outside_reserve);
1963 2310
1964 if (IS_ERR(new_page)) { 2311 if (IS_ERR(new_page)) {
@@ -1976,19 +2323,25 @@ retry_avoidcopy:
1976 if (unmap_ref_private(mm, vma, old_page, address)) { 2323 if (unmap_ref_private(mm, vma, old_page, address)) {
1977 BUG_ON(page_count(old_page) != 1); 2324 BUG_ON(page_count(old_page) != 1);
1978 BUG_ON(huge_pte_none(pte)); 2325 BUG_ON(huge_pte_none(pte));
2326 spin_lock(&mm->page_table_lock);
1979 goto retry_avoidcopy; 2327 goto retry_avoidcopy;
1980 } 2328 }
1981 WARN_ON_ONCE(1); 2329 WARN_ON_ONCE(1);
1982 } 2330 }
1983 2331
2332 /* Caller expects lock to be held */
2333 spin_lock(&mm->page_table_lock);
1984 return -PTR_ERR(new_page); 2334 return -PTR_ERR(new_page);
1985 } 2335 }
1986 2336
1987 spin_unlock(&mm->page_table_lock);
1988 copy_huge_page(new_page, old_page, address, vma); 2337 copy_huge_page(new_page, old_page, address, vma);
1989 __SetPageUptodate(new_page); 2338 __SetPageUptodate(new_page);
1990 spin_lock(&mm->page_table_lock);
1991 2339
2340 /*
2341 * Retake the page_table_lock to check for racing updates
2342 * before the page tables are altered
2343 */
2344 spin_lock(&mm->page_table_lock);
1992 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2345 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
1993 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2346 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
1994 /* Break COW */ 2347 /* Break COW */
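
The COW hunk's core idea is the drop-relock-revalidate dance: page_table_lock cannot be held across alloc_huge_page() because the buddy allocator may sleep, so the lock is released, the allocation and copy happen unlocked, and the lock is retaken to recheck that the PTE still holds the value observed earlier before the page tables are altered. A hedged userspace sketch of the pattern, with a pthread mutex standing in for page_table_lock (names invented for illustration):

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;	/* page_table_lock stand-in */
	static int *pte;	/* "page table entry": points at the current page */

	static void cow_one_entry(void)
	{
		pthread_mutex_lock(&table_lock);
		int *old_page = pte;			/* snapshot under the lock   */
		pthread_mutex_unlock(&table_lock);	/* allocator below may sleep */

		int *new_page = malloc(sizeof(*new_page));
		*new_page = *old_page;			/* copy_huge_page() stand-in */

		/* Retake the lock and check for racing updates before the
		 * "page table" is altered, exactly as the patch does. */
		pthread_mutex_lock(&table_lock);
		if (pte == old_page)
			pte = new_page;			/* break COW                  */
		else
			free(new_page);			/* lost a race; drop our copy */
		pthread_mutex_unlock(&table_lock);
	}

	int main(void)
	{
		static int original = 42;

		pte = &original;
		cow_one_entry();
		printf("pte now holds %d at %p\n", *pte, (void *)pte);
		return 0;
	}
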