path: root/mm/hugetlb.c
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	565
1 file changed, 460 insertions(+), 105 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5d7601b02874..4c9e6bbf3772 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2,7 +2,6 @@
  * Generic hugetlb support.
  * (C) William Irwin, April 2004
  */
-#include <linux/gfp.h>
 #include <linux/list.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -18,12 +17,14 @@
 #include <linux/mutex.h>
 #include <linux/bootmem.h>
 #include <linux/sysfs.h>
+#include <linux/slab.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/io.h>
 
 #include <linux/hugetlb.h>
+#include <linux/node.h>
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
@@ -401,7 +402,7 @@ static void clear_huge_page(struct page *page,
 {
 	int i;
 
-	if (unlikely(sz > MAX_ORDER_NR_PAGES)) {
+	if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
 		clear_gigantic_page(page, addr, sz);
 		return;
 	}
@@ -545,6 +546,7 @@ static void free_huge_page(struct page *page)
 
 	mapping = (struct address_space *) page_private(page);
 	set_page_private(page, 0);
+	page->mapping = NULL;
 	BUG_ON(page_count(page));
 	INIT_LIST_HEAD(&page->lru);
 
@@ -622,42 +624,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 }
 
 /*
- * Use a helper variable to find the next node and then
- * copy it back to next_nid_to_alloc afterwards:
- * otherwise there's a window in which a racer might
- * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
- * But we don't need to use a spin_lock here: it really
- * doesn't matter if occasionally a racer chooses the
- * same nid as we do.  Move nid forward in the mask even
- * if we just successfully allocated a hugepage so that
- * the next caller gets hugepages on the next node.
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed.  Ensure that we use an allowed
+ * node for alloc or free.
  */
-static int hstate_next_node_to_alloc(struct hstate *h)
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
 {
-	int next_nid;
-	next_nid = next_node(h->next_nid_to_alloc, node_online_map);
-	if (next_nid == MAX_NUMNODES)
-		next_nid = first_node(node_online_map);
-	h->next_nid_to_alloc = next_nid;
-	return next_nid;
+	nid = next_node(nid, *nodes_allowed);
+	if (nid == MAX_NUMNODES)
+		nid = first_node(*nodes_allowed);
+	VM_BUG_ON(nid >= MAX_NUMNODES);
+
+	return nid;
 }
 
-static int alloc_fresh_huge_page(struct hstate *h)
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+	if (!node_isset(nid, *nodes_allowed))
+		nid = next_node_allowed(nid, nodes_allowed);
+	return nid;
+}
+
+/*
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
+ */
+static int hstate_next_node_to_alloc(struct hstate *h,
+					nodemask_t *nodes_allowed)
+{
+	int nid;
+
+	VM_BUG_ON(!nodes_allowed);
+
+	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
+
+	return nid;
+}
+
+static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
 {
 	struct page *page;
 	int start_nid;
 	int next_nid;
 	int ret = 0;
 
-	start_nid = h->next_nid_to_alloc;
+	start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
 	next_nid = start_nid;
 
 	do {
 		page = alloc_fresh_huge_page_node(h, next_nid);
-		if (page)
+		if (page) {
 			ret = 1;
-		next_nid = hstate_next_node_to_alloc(h);
-	} while (!page && next_nid != start_nid);
+			break;
+		}
+		next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
+	} while (next_nid != start_nid);
 
 	if (ret)
 		count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -668,17 +694,21 @@ static int alloc_fresh_huge_page(struct hstate *h)
 }
 
 /*
- * helper for free_pool_huge_page() - find next node
- * from which to free a huge page
+ * helper for free_pool_huge_page() - return the previously saved
+ * node ["this node"] from which to free a huge page.  Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
  */
-static int hstate_next_node_to_free(struct hstate *h)
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
 {
-	int next_nid;
-	next_nid = next_node(h->next_nid_to_free, node_online_map);
-	if (next_nid == MAX_NUMNODES)
-		next_nid = first_node(node_online_map);
-	h->next_nid_to_free = next_nid;
-	return next_nid;
+	int nid;
+
+	VM_BUG_ON(!nodes_allowed);
+
+	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
+
+	return nid;
 }
 
 /*
@@ -687,13 +717,14 @@ static int hstate_next_node_to_free(struct hstate *h)
  * balanced over allowed nodes.
  * Called with hugetlb_lock locked.
  */
-static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
+static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+						 bool acct_surplus)
 {
 	int start_nid;
 	int next_nid;
 	int ret = 0;
 
-	start_nid = h->next_nid_to_free;
+	start_nid = hstate_next_node_to_free(h, nodes_allowed);
 	next_nid = start_nid;
 
 	do {
@@ -715,9 +746,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
 			}
 			update_and_free_page(h, page);
 			ret = 1;
+			break;
 		}
-		next_nid = hstate_next_node_to_free(h);
-	} while (!ret && next_nid != start_nid);
+		next_nid = hstate_next_node_to_free(h, nodes_allowed);
+	} while (next_nid != start_nid);
 
 	return ret;
 }
@@ -911,14 +943,14 @@ static void return_unused_surplus_pages(struct hstate *h,
 
 	/*
 	 * We want to release as many surplus pages as possible, spread
-	 * evenly across all nodes.  Iterate across all nodes until we
-	 * can no longer free unreserved surplus pages. This occurs when
-	 * the nodes with surplus pages have no free pages.
-	 * free_pool_huge_page() will balance the the frees across the
-	 * on-line nodes for us and will handle the hstate accounting.
+	 * evenly across all nodes with memory. Iterate across these nodes
+	 * until we can no longer free unreserved surplus pages. This occurs
+	 * when the nodes with surplus pages have no free pages.
+	 * free_pool_huge_page() will balance the the freed pages across the
+	 * on-line nodes with memory and will handle the hstate accounting.
 	 */
 	while (nr_pages--) {
-		if (!free_pool_huge_page(h, 1))
+		if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
 			break;
 	}
 }
@@ -1007,7 +1039,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 		page = alloc_buddy_huge_page(h, vma, addr);
 		if (!page) {
 			hugetlb_put_quota(inode->i_mapping, chg);
-			return ERR_PTR(-VM_FAULT_OOM);
+			return ERR_PTR(-VM_FAULT_SIGBUS);
 		}
 	}
 
@@ -1022,16 +1054,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 int __weak alloc_bootmem_huge_page(struct hstate *h)
 {
 	struct huge_bootmem_page *m;
-	int nr_nodes = nodes_weight(node_online_map);
+	int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
 
 	while (nr_nodes) {
 		void *addr;
 
 		addr = __alloc_bootmem_node_nopanic(
-				NODE_DATA(h->next_nid_to_alloc),
+				NODE_DATA(hstate_next_node_to_alloc(h,
+					&node_states[N_HIGH_MEMORY])),
 				huge_page_size(h), huge_page_size(h), 0);
 
-		hstate_next_node_to_alloc(h);
 		if (addr) {
 			/*
 			 * Use the beginning of the huge page to store the
@@ -1084,7 +1116,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 		if (h->order >= MAX_ORDER) {
 			if (!alloc_bootmem_huge_page(h))
 				break;
-		} else if (!alloc_fresh_huge_page(h))
+		} else if (!alloc_fresh_huge_page(h,
+					 &node_states[N_HIGH_MEMORY]))
 			break;
 	}
 	h->max_huge_pages = i;
@@ -1126,14 +1159,15 @@ static void __init report_hugepages(void)
 }
 
 #ifdef CONFIG_HIGHMEM
-static void try_to_free_low(struct hstate *h, unsigned long count)
+static void try_to_free_low(struct hstate *h, unsigned long count,
+						nodemask_t *nodes_allowed)
 {
 	int i;
 
 	if (h->order >= MAX_ORDER)
 		return;
 
-	for (i = 0; i < MAX_NUMNODES; ++i) {
+	for_each_node_mask(i, *nodes_allowed) {
 		struct page *page, *next;
 		struct list_head *freel = &h->hugepage_freelists[i];
 		list_for_each_entry_safe(page, next, freel, lru) {
@@ -1149,7 +1183,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count)
 	}
 }
 #else
-static inline void try_to_free_low(struct hstate *h, unsigned long count)
+static inline void try_to_free_low(struct hstate *h, unsigned long count,
+						nodemask_t *nodes_allowed)
 {
 }
 #endif
@@ -1159,7 +1194,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
  * balanced by operating on them in a round-robin fashion.
  * Returns 1 if an adjustment was made.
  */
-static int adjust_pool_surplus(struct hstate *h, int delta)
+static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
+				int delta)
 {
 	int start_nid, next_nid;
 	int ret = 0;
@@ -1167,29 +1203,33 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
 	VM_BUG_ON(delta != -1 && delta != 1);
 
 	if (delta < 0)
-		start_nid = h->next_nid_to_alloc;
+		start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
 	else
-		start_nid = h->next_nid_to_free;
+		start_nid = hstate_next_node_to_free(h, nodes_allowed);
 	next_nid = start_nid;
 
 	do {
 		int nid = next_nid;
 		if (delta < 0) {
-			next_nid = hstate_next_node_to_alloc(h);
 			/*
 			 * To shrink on this node, there must be a surplus page
 			 */
-			if (!h->surplus_huge_pages_node[nid])
+			if (!h->surplus_huge_pages_node[nid]) {
+				next_nid = hstate_next_node_to_alloc(h,
+								nodes_allowed);
 				continue;
+			}
 		}
 		if (delta > 0) {
-			next_nid = hstate_next_node_to_free(h);
 			/*
 			 * Surplus cannot exceed the total number of pages
 			 */
 			if (h->surplus_huge_pages_node[nid] >=
-						h->nr_huge_pages_node[nid])
+						h->nr_huge_pages_node[nid]) {
+				next_nid = hstate_next_node_to_free(h,
+								nodes_allowed);
 				continue;
+			}
 		}
 
 		h->surplus_huge_pages += delta;
@@ -1202,7 +1242,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
 }
 
 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
+static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
+						nodemask_t *nodes_allowed)
 {
 	unsigned long min_count, ret;
 
@@ -1222,7 +1263,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 	 */
 	spin_lock(&hugetlb_lock);
 	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
-		if (!adjust_pool_surplus(h, -1))
+		if (!adjust_pool_surplus(h, nodes_allowed, -1))
 			break;
 	}
 
@@ -1233,11 +1274,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 		 * and reducing the surplus.
 		 */
 		spin_unlock(&hugetlb_lock);
-		ret = alloc_fresh_huge_page(h);
+		ret = alloc_fresh_huge_page(h, nodes_allowed);
 		spin_lock(&hugetlb_lock);
 		if (!ret)
 			goto out;
 
+		/* Bail for signals. Probably ctrl-c from user */
+		if (signal_pending(current))
+			goto out;
 	}
 
 	/*
@@ -1257,13 +1301,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 	 */
 	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
 	min_count = max(count, min_count);
-	try_to_free_low(h, min_count);
+	try_to_free_low(h, min_count, nodes_allowed);
 	while (min_count < persistent_huge_pages(h)) {
-		if (!free_pool_huge_page(h, 0))
+		if (!free_pool_huge_page(h, nodes_allowed, 0))
 			break;
 	}
 	while (count < persistent_huge_pages(h)) {
-		if (!adjust_pool_surplus(h, 1))
+		if (!adjust_pool_surplus(h, nodes_allowed, 1))
 			break;
 	}
 out:
@@ -1282,43 +1326,117 @@ out:
 static struct kobject *hugepages_kobj;
 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
 
-static struct hstate *kobj_to_hstate(struct kobject *kobj)
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
+
+static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
 {
 	int i;
+
 	for (i = 0; i < HUGE_MAX_HSTATE; i++)
-		if (hstate_kobjs[i] == kobj)
+		if (hstate_kobjs[i] == kobj) {
+			if (nidp)
+				*nidp = NUMA_NO_NODE;
 			return &hstates[i];
-	BUG();
-	return NULL;
+		}
+
+	return kobj_to_node_hstate(kobj, nidp);
 }
 
-static ssize_t nr_hugepages_show(struct kobject *kobj,
+static ssize_t nr_hugepages_show_common(struct kobject *kobj,
 					struct kobj_attribute *attr, char *buf)
 {
-	struct hstate *h = kobj_to_hstate(kobj);
-	return sprintf(buf, "%lu\n", h->nr_huge_pages);
+	struct hstate *h;
+	unsigned long nr_huge_pages;
+	int nid;
+
+	h = kobj_to_hstate(kobj, &nid);
+	if (nid == NUMA_NO_NODE)
+		nr_huge_pages = h->nr_huge_pages;
+	else
+		nr_huge_pages = h->nr_huge_pages_node[nid];
+
+	return sprintf(buf, "%lu\n", nr_huge_pages);
 }
-static ssize_t nr_hugepages_store(struct kobject *kobj,
-		struct kobj_attribute *attr, const char *buf, size_t count)
+static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
+			struct kobject *kobj, struct kobj_attribute *attr,
+			const char *buf, size_t len)
 {
 	int err;
-	unsigned long input;
-	struct hstate *h = kobj_to_hstate(kobj);
+	int nid;
+	unsigned long count;
+	struct hstate *h;
+	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
 
-	err = strict_strtoul(buf, 10, &input);
+	err = strict_strtoul(buf, 10, &count);
 	if (err)
 		return 0;
 
-	h->max_huge_pages = set_max_huge_pages(h, input);
+	h = kobj_to_hstate(kobj, &nid);
+	if (nid == NUMA_NO_NODE) {
+		/*
+		 * global hstate attribute
+		 */
+		if (!(obey_mempolicy &&
+				init_nodemask_of_mempolicy(nodes_allowed))) {
+			NODEMASK_FREE(nodes_allowed);
+			nodes_allowed = &node_states[N_HIGH_MEMORY];
+		}
+	} else if (nodes_allowed) {
+		/*
+		 * per node hstate attribute: adjust count to global,
+		 * but restrict alloc/free to the specified node.
+		 */
+		count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+		init_nodemask_of_node(nodes_allowed, nid);
+	} else
+		nodes_allowed = &node_states[N_HIGH_MEMORY];
 
-	return count;
+	h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
+
+	if (nodes_allowed != &node_states[N_HIGH_MEMORY])
+		NODEMASK_FREE(nodes_allowed);
+
+	return len;
+}
+
+static ssize_t nr_hugepages_show(struct kobject *kobj,
+				       struct kobj_attribute *attr, char *buf)
+{
+	return nr_hugepages_show_common(kobj, attr, buf);
+}
+
+static ssize_t nr_hugepages_store(struct kobject *kobj,
+	       struct kobj_attribute *attr, const char *buf, size_t len)
+{
+	return nr_hugepages_store_common(false, kobj, attr, buf, len);
 }
 HSTATE_ATTR(nr_hugepages);
 
+#ifdef CONFIG_NUMA
+
+/*
+ * hstate attribute for optionally mempolicy-based constraint on persistent
+ * huge page alloc/free.
+ */
+static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
+				       struct kobj_attribute *attr, char *buf)
+{
+	return nr_hugepages_show_common(kobj, attr, buf);
+}
+
+static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
+	       struct kobj_attribute *attr, const char *buf, size_t len)
+{
+	return nr_hugepages_store_common(true, kobj, attr, buf, len);
+}
+HSTATE_ATTR(nr_hugepages_mempolicy);
+#endif
+
+
 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
 					struct kobj_attribute *attr, char *buf)
 {
-	struct hstate *h = kobj_to_hstate(kobj);
+	struct hstate *h = kobj_to_hstate(kobj, NULL);
 	return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
 }
 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
@@ -1326,7 +1444,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
 {
 	int err;
 	unsigned long input;
-	struct hstate *h = kobj_to_hstate(kobj);
+	struct hstate *h = kobj_to_hstate(kobj, NULL);
 
 	err = strict_strtoul(buf, 10, &input);
 	if (err)
@@ -1343,15 +1461,24 @@ HSTATE_ATTR(nr_overcommit_hugepages);
 static ssize_t free_hugepages_show(struct kobject *kobj,
 					struct kobj_attribute *attr, char *buf)
 {
-	struct hstate *h = kobj_to_hstate(kobj);
-	return sprintf(buf, "%lu\n", h->free_huge_pages);
+	struct hstate *h;
+	unsigned long free_huge_pages;
+	int nid;
+
+	h = kobj_to_hstate(kobj, &nid);
+	if (nid == NUMA_NO_NODE)
+		free_huge_pages = h->free_huge_pages;
+	else
+		free_huge_pages = h->free_huge_pages_node[nid];
+
+	return sprintf(buf, "%lu\n", free_huge_pages);
 }
 HSTATE_ATTR_RO(free_hugepages);
 
 static ssize_t resv_hugepages_show(struct kobject *kobj,
 					struct kobj_attribute *attr, char *buf)
 {
-	struct hstate *h = kobj_to_hstate(kobj);
+	struct hstate *h = kobj_to_hstate(kobj, NULL);
 	return sprintf(buf, "%lu\n", h->resv_huge_pages);
 }
 HSTATE_ATTR_RO(resv_hugepages);
@@ -1359,8 +1486,17 @@ HSTATE_ATTR_RO(resv_hugepages);
 static ssize_t surplus_hugepages_show(struct kobject *kobj,
 					struct kobj_attribute *attr, char *buf)
 {
-	struct hstate *h = kobj_to_hstate(kobj);
-	return sprintf(buf, "%lu\n", h->surplus_huge_pages);
+	struct hstate *h;
+	unsigned long surplus_huge_pages;
+	int nid;
+
+	h = kobj_to_hstate(kobj, &nid);
+	if (nid == NUMA_NO_NODE)
+		surplus_huge_pages = h->surplus_huge_pages;
+	else
+		surplus_huge_pages = h->surplus_huge_pages_node[nid];
+
+	return sprintf(buf, "%lu\n", surplus_huge_pages);
 }
 HSTATE_ATTR_RO(surplus_hugepages);
 
@@ -1370,6 +1506,9 @@ static struct attribute *hstate_attrs[] = {
 	&free_hugepages_attr.attr,
 	&resv_hugepages_attr.attr,
 	&surplus_hugepages_attr.attr,
+#ifdef CONFIG_NUMA
+	&nr_hugepages_mempolicy_attr.attr,
+#endif
 	NULL,
 };
 
@@ -1377,19 +1516,20 @@ static struct attribute_group hstate_attr_group = {
 	.attrs = hstate_attrs,
 };
 
-static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
+static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
+				    struct kobject **hstate_kobjs,
+				    struct attribute_group *hstate_attr_group)
 {
 	int retval;
+	int hi = h - hstates;
 
-	hstate_kobjs[h - hstates] = kobject_create_and_add(h->name,
-							hugepages_kobj);
-	if (!hstate_kobjs[h - hstates])
+	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
+	if (!hstate_kobjs[hi])
 		return -ENOMEM;
 
-	retval = sysfs_create_group(hstate_kobjs[h - hstates],
-				    &hstate_attr_group);
+	retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
 	if (retval)
-		kobject_put(hstate_kobjs[h - hstates]);
+		kobject_put(hstate_kobjs[hi]);
 
 	return retval;
 }
@@ -1404,17 +1544,184 @@ static void __init hugetlb_sysfs_init(void)
 		return;
 
 	for_each_hstate(h) {
-		err = hugetlb_sysfs_add_hstate(h);
+		err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
+					 hstate_kobjs, &hstate_attr_group);
 		if (err)
 			printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
 				h->name);
 	}
 }
 
+#ifdef CONFIG_NUMA
+
+/*
+ * node_hstate/s - associate per node hstate attributes, via their kobjects,
+ * with node sysdevs in node_devices[] using a parallel array.  The array
+ * index of a node sysdev or _hstate == node id.
+ * This is here to avoid any static dependency of the node sysdev driver, in
+ * the base kernel, on the hugetlb module.
+ */
+struct node_hstate {
+	struct kobject		*hugepages_kobj;
+	struct kobject		*hstate_kobjs[HUGE_MAX_HSTATE];
+};
+struct node_hstate node_hstates[MAX_NUMNODES];
+
+/*
+ * A subset of global hstate attributes for node sysdevs
+ */
+static struct attribute *per_node_hstate_attrs[] = {
+	&nr_hugepages_attr.attr,
+	&free_hugepages_attr.attr,
+	&surplus_hugepages_attr.attr,
+	NULL,
+};
+
+static struct attribute_group per_node_hstate_attr_group = {
+	.attrs = per_node_hstate_attrs,
+};
+
+/*
+ * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj.
+ * Returns node id via non-NULL nidp.
+ */
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
+{
+	int nid;
+
+	for (nid = 0; nid < nr_node_ids; nid++) {
+		struct node_hstate *nhs = &node_hstates[nid];
+		int i;
+		for (i = 0; i < HUGE_MAX_HSTATE; i++)
+			if (nhs->hstate_kobjs[i] == kobj) {
+				if (nidp)
+					*nidp = nid;
+				return &hstates[i];
+			}
+	}
+
+	BUG();
+	return NULL;
+}
+
+/*
+ * Unregister hstate attributes from a single node sysdev.
+ * No-op if no hstate attributes attached.
+ */
+void hugetlb_unregister_node(struct node *node)
+{
+	struct hstate *h;
+	struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+
+	if (!nhs->hugepages_kobj)
+		return;		/* no hstate attributes */
+
+	for_each_hstate(h)
+		if (nhs->hstate_kobjs[h - hstates]) {
+			kobject_put(nhs->hstate_kobjs[h - hstates]);
+			nhs->hstate_kobjs[h - hstates] = NULL;
+		}
+
+	kobject_put(nhs->hugepages_kobj);
+	nhs->hugepages_kobj = NULL;
+}
+
+/*
+ * hugetlb module exit:  unregister hstate attributes from node sysdevs
+ * that have them.
+ */
+static void hugetlb_unregister_all_nodes(void)
+{
+	int nid;
+
+	/*
+	 * disable node sysdev registrations.
+	 */
+	register_hugetlbfs_with_node(NULL, NULL);
+
+	/*
+	 * remove hstate attributes from any nodes that have them.
+	 */
+	for (nid = 0; nid < nr_node_ids; nid++)
+		hugetlb_unregister_node(&node_devices[nid]);
+}
+
+/*
+ * Register hstate attributes for a single node sysdev.
+ * No-op if attributes already registered.
+ */
+void hugetlb_register_node(struct node *node)
+{
+	struct hstate *h;
+	struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+	int err;
+
+	if (nhs->hugepages_kobj)
+		return;		/* already allocated */
+
+	nhs->hugepages_kobj = kobject_create_and_add("hugepages",
+							&node->sysdev.kobj);
+	if (!nhs->hugepages_kobj)
+		return;
+
+	for_each_hstate(h) {
+		err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
+						nhs->hstate_kobjs,
+						&per_node_hstate_attr_group);
+		if (err) {
+			printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
+				" for node %d\n",
+						h->name, node->sysdev.id);
+			hugetlb_unregister_node(node);
+			break;
+		}
+	}
+}
+
+/*
+ * hugetlb init time:  register hstate attributes for all registered node
+ * sysdevs of nodes that have memory.  All on-line nodes should have
+ * registered their associated sysdev by this time.
+ */
+static void hugetlb_register_all_nodes(void)
+{
+	int nid;
+
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		struct node *node = &node_devices[nid];
+		if (node->sysdev.id == nid)
+			hugetlb_register_node(node);
+	}
+
+	/*
+	 * Let the node sysdev driver know we're here so it can
+	 * [un]register hstate attributes on node hotplug.
+	 */
+	register_hugetlbfs_with_node(hugetlb_register_node,
+				     hugetlb_unregister_node);
+}
+#else	/* !CONFIG_NUMA */
+
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
+{
+	BUG();
+	if (nidp)
+		*nidp = -1;
+	return NULL;
+}
+
+static void hugetlb_unregister_all_nodes(void) { }
+
+static void hugetlb_register_all_nodes(void) { }
+
+#endif
+
 static void __exit hugetlb_exit(void)
 {
 	struct hstate *h;
 
+	hugetlb_unregister_all_nodes();
+
 	for_each_hstate(h) {
 		kobject_put(hstate_kobjs[h - hstates]);
 	}
@@ -1449,6 +1756,8 @@ static int __init hugetlb_init(void)
 
 	hugetlb_sysfs_init();
 
+	hugetlb_register_all_nodes();
+
 	return 0;
 }
 module_init(hugetlb_init);
@@ -1472,8 +1781,8 @@ void __init hugetlb_add_hstate(unsigned order)
 	h->free_huge_pages = 0;
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
-	h->next_nid_to_alloc = first_node(node_online_map);
-	h->next_nid_to_free = first_node(node_online_map);
+	h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
+	h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
 
@@ -1536,9 +1845,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 }
 
 #ifdef CONFIG_SYSCTL
-int hugetlb_sysctl_handler(struct ctl_table *table, int write,
-			   void __user *buffer,
-			   size_t *length, loff_t *ppos)
+static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
+			 struct ctl_table *table, int write,
+			 void __user *buffer, size_t *length, loff_t *ppos)
 {
 	struct hstate *h = &default_hstate;
 	unsigned long tmp;
@@ -1550,12 +1859,40 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 		table->maxlen = sizeof(unsigned long);
 	proc_doulongvec_minmax(table, write, buffer, length, ppos);
 
-	if (write)
-		h->max_huge_pages = set_max_huge_pages(h, tmp);
+	if (write) {
+		NODEMASK_ALLOC(nodemask_t, nodes_allowed,
+						GFP_KERNEL | __GFP_NORETRY);
+		if (!(obey_mempolicy &&
+			       init_nodemask_of_mempolicy(nodes_allowed))) {
+			NODEMASK_FREE(nodes_allowed);
+			nodes_allowed = &node_states[N_HIGH_MEMORY];
+		}
+		h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
+
+		if (nodes_allowed != &node_states[N_HIGH_MEMORY])
+			NODEMASK_FREE(nodes_allowed);
+	}
 
 	return 0;
 }
 
+int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+			  void __user *buffer, size_t *length, loff_t *ppos)
+{
+
+	return hugetlb_sysctl_handler_common(false, table, write,
+							buffer, length, ppos);
+}
+
+#ifdef CONFIG_NUMA
+int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
+			  void __user *buffer, size_t *length, loff_t *ppos)
+{
+	return hugetlb_sysctl_handler_common(true, table, write,
+							buffer, length, ppos);
+}
+#endif /* CONFIG_NUMA */
+
 int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
 			void __user *buffer,
 			size_t *length, loff_t *ppos)
@@ -1751,7 +2088,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
 
 	entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
 	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
-		update_mmu_cache(vma, address, entry);
+		update_mmu_cache(vma, address, ptep);
 	}
 }
 
@@ -1903,6 +2240,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 			+ (vma->vm_pgoff >> PAGE_SHIFT);
 	mapping = (struct address_space *)page_private(page);
 
+	/*
+	 * Take the mapping lock for the duration of the table walk. As
+	 * this mapping should be shared between all the VMAs,
+	 * __unmap_hugepage_range() is called as the lock is already held
+	 */
+	spin_lock(&mapping->i_mmap_lock);
 	vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		/* Do not unmap the current VMA */
 		if (iter_vma == vma)
@@ -1916,10 +2259,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * from the time of fork. This would look like data corruption
 		 */
 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
-			unmap_hugepage_range(iter_vma,
+			__unmap_hugepage_range(iter_vma,
 				address, address + huge_page_size(h),
 				page);
 	}
+	spin_unlock(&mapping->i_mmap_lock);
 
 	return 1;
 }
@@ -1959,6 +2303,9 @@ retry_avoidcopy:
 		outside_reserve = 1;
 
 	page_cache_get(old_page);
+
+	/* Drop page_table_lock as buddy allocator may be called */
+	spin_unlock(&mm->page_table_lock);
 	new_page = alloc_huge_page(vma, address, outside_reserve);
 
 	if (IS_ERR(new_page)) {
@@ -1976,19 +2323,25 @@ retry_avoidcopy:
 			if (unmap_ref_private(mm, vma, old_page, address)) {
 				BUG_ON(page_count(old_page) != 1);
 				BUG_ON(huge_pte_none(pte));
+				spin_lock(&mm->page_table_lock);
 				goto retry_avoidcopy;
 			}
 			WARN_ON_ONCE(1);
 		}
 
+		/* Caller expects lock to be held */
+		spin_lock(&mm->page_table_lock);
 		return -PTR_ERR(new_page);
 	}
 
-	spin_unlock(&mm->page_table_lock);
 	copy_huge_page(new_page, old_page, address, vma);
 	__SetPageUptodate(new_page);
-	spin_lock(&mm->page_table_lock);
 
+	/*
+	 * Retake the page_table_lock to check for racing updates
+	 * before the page tables are altered
+	 */
+	spin_lock(&mm->page_table_lock);
 	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
 		/* Break COW */
@@ -2095,8 +2448,10 @@ retry:
 			spin_lock(&inode->i_lock);
 			inode->i_blocks += blocks_per_huge_page(h);
 			spin_unlock(&inode->i_lock);
-		} else
+		} else {
 			lock_page(page);
+			page->mapping = HUGETLB_POISON;
+		}
 	}
 
 	/*
@@ -2206,7 +2561,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	entry = pte_mkyoung(entry);
 	if (huge_ptep_set_access_flags(vma, address, ptep, entry,
 						flags & FAULT_FLAG_WRITE))
-		update_mmu_cache(vma, address, entry);
+		update_mmu_cache(vma, address, ptep);
 
 out_page_table_lock:
 	spin_unlock(&mm->page_table_lock);
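
Note (not part of the patch): the per-node hstate attributes registered above show up under each node's sysdev directory, e.g. /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages, alongside the existing global attributes; the directory name comes from the "hugepages-%lukB" format in hugetlb_add_hstate(). The sketch below is an illustration only, with the node id, huge page size and count chosen arbitrarily, of how user space might drive the new per-node interface; a write here reaches nr_hugepages_store_common() with the node's id, so the alloc/free is restricted to that node.

/* Illustrative sketch, not part of the patch: set the huge page pool on
 * one node via the per-node sysfs attribute introduced above.  Node 0,
 * the 2048kB hstate and a count of 64 are assumptions for the example. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/devices/system/node/node0/hugepages/"
			   "hugepages-2048kB/nr_hugepages";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* The kernel adjusts the count to a global value but restricts the
	 * allocation/freeing to node 0 via init_nodemask_of_node(). */
	if (fprintf(f, "64\n") < 0)
		perror("fprintf");
	fclose(f);
	return 0;
}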