Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 266 |
1 file changed, 158 insertions, 108 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cafdcee154e8..5d7601b02874 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -234,6 +234,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) | |||
234 | 234 | ||
235 | return 1UL << (hstate->order + PAGE_SHIFT); | 235 | return 1UL << (hstate->order + PAGE_SHIFT); |
236 | } | 236 | } |
237 | EXPORT_SYMBOL_GPL(vma_kernel_pagesize); | ||
237 | 238 | ||
238 | /* | 239 | /* |
239 | * Return the page size being used by the MMU to back a VMA. In the majority | 240 | * Return the page size being used by the MMU to back a VMA. In the majority |
@@ -455,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) | |||
455 | h->free_huge_pages_node[nid]++; | 456 | h->free_huge_pages_node[nid]++; |
456 | } | 457 | } |
457 | 458 | ||
458 | static struct page *dequeue_huge_page(struct hstate *h) | ||
459 | { | ||
460 | int nid; | ||
461 | struct page *page = NULL; | ||
462 | |||
463 | for (nid = 0; nid < MAX_NUMNODES; ++nid) { | ||
464 | if (!list_empty(&h->hugepage_freelists[nid])) { | ||
465 | page = list_entry(h->hugepage_freelists[nid].next, | ||
466 | struct page, lru); | ||
467 | list_del(&page->lru); | ||
468 | h->free_huge_pages--; | ||
469 | h->free_huge_pages_node[nid]--; | ||
470 | break; | ||
471 | } | ||
472 | } | ||
473 | return page; | ||
474 | } | ||
475 | |||
476 | static struct page *dequeue_huge_page_vma(struct hstate *h, | 459 | static struct page *dequeue_huge_page_vma(struct hstate *h, |
477 | struct vm_area_struct *vma, | 460 | struct vm_area_struct *vma, |
478 | unsigned long address, int avoid_reserve) | 461 | unsigned long address, int avoid_reserve) |
@@ -640,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
640 | 623 | ||
641 | /* | 624 | /* |
642 | * Use a helper variable to find the next node and then | 625 | * Use a helper variable to find the next node and then |
643 | * copy it back to hugetlb_next_nid afterwards: | 626 | * copy it back to next_nid_to_alloc afterwards: |
644 | * otherwise there's a window in which a racer might | 627 | * otherwise there's a window in which a racer might |
645 | * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. | 628 | * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. |
646 | * But we don't need to use a spin_lock here: it really | 629 | * But we don't need to use a spin_lock here: it really |
@@ -649,13 +632,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
649 | * if we just successfully allocated a hugepage so that | 632 | * if we just successfully allocated a hugepage so that |
650 | * the next caller gets hugepages on the next node. | 633 | * the next caller gets hugepages on the next node. |
651 | */ | 634 | */ |
652 | static int hstate_next_node(struct hstate *h) | 635 | static int hstate_next_node_to_alloc(struct hstate *h) |
653 | { | 636 | { |
654 | int next_nid; | 637 | int next_nid; |
655 | next_nid = next_node(h->hugetlb_next_nid, node_online_map); | 638 | next_nid = next_node(h->next_nid_to_alloc, node_online_map); |
656 | if (next_nid == MAX_NUMNODES) | 639 | if (next_nid == MAX_NUMNODES) |
657 | next_nid = first_node(node_online_map); | 640 | next_nid = first_node(node_online_map); |
658 | h->hugetlb_next_nid = next_nid; | 641 | h->next_nid_to_alloc = next_nid; |
659 | return next_nid; | 642 | return next_nid; |
660 | } | 643 | } |
661 | 644 | ||
@@ -666,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h) | |||
666 | int next_nid; | 649 | int next_nid; |
667 | int ret = 0; | 650 | int ret = 0; |
668 | 651 | ||
669 | start_nid = h->hugetlb_next_nid; | 652 | start_nid = h->next_nid_to_alloc; |
653 | next_nid = start_nid; | ||
670 | 654 | ||
671 | do { | 655 | do { |
672 | page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); | 656 | page = alloc_fresh_huge_page_node(h, next_nid); |
673 | if (page) | 657 | if (page) |
674 | ret = 1; | 658 | ret = 1; |
675 | next_nid = hstate_next_node(h); | 659 | next_nid = hstate_next_node_to_alloc(h); |
676 | } while (!page && h->hugetlb_next_nid != start_nid); | 660 | } while (!page && next_nid != start_nid); |
677 | 661 | ||
678 | if (ret) | 662 | if (ret) |
679 | count_vm_event(HTLB_BUDDY_PGALLOC); | 663 | count_vm_event(HTLB_BUDDY_PGALLOC); |
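The alloc-side interleaving above comes down to a saved per-hstate cursor that is advanced with wrap-around after every attempt, plus a do/while loop that gives up once the walk returns to its starting node. The standalone C sketch below mirrors that pattern with made-up names (MAX_NODES, node_online[] and try_alloc_on_node() are illustrative stand-ins, not kernel APIs):

/*
 * Illustrative userspace sketch of the round-robin node walk used above.
 * MAX_NODES, node_online[] and try_alloc_on_node() stand in for the
 * kernel's node_online_map and page allocator; they are not real APIs.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 4

static bool node_online[MAX_NODES] = { true, true, false, true };
static int next_nid_to_alloc;	/* the per-hstate cursor in the patch */

/* Advance the cursor to the next online node, wrapping past the end. */
static int advance_alloc_cursor(void)
{
	int nid = next_nid_to_alloc;

	do {
		nid = (nid + 1) % MAX_NODES;
	} while (!node_online[nid]);
	next_nid_to_alloc = nid;
	return nid;
}

/* Pretend allocation: in this example only node 3 has free memory. */
static bool try_alloc_on_node(int nid)
{
	return nid == 3;
}

int main(void)
{
	int start_nid = next_nid_to_alloc;
	int nid = start_nid;
	bool allocated = false;

	/* Try each online node at most once, starting at the saved cursor. */
	do {
		allocated = try_alloc_on_node(nid);
		nid = advance_alloc_cursor();
	} while (!allocated && nid != start_nid);

	printf("allocation %s; cursor now at node %d\n",
	       allocated ? "succeeded" : "failed", next_nid_to_alloc);
	return 0;
}

As in alloc_fresh_huge_page(), the cursor is left pointing one node past the one that satisfied the allocation, so the next caller starts the walk somewhere else.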
@@ -683,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h) | |||
683 | return ret; | 667 | return ret; |
684 | } | 668 | } |
685 | 669 | ||
670 | /* | ||
671 | * helper for free_pool_huge_page() - find next node | ||
672 | * from which to free a huge page | ||
673 | */ | ||
674 | static int hstate_next_node_to_free(struct hstate *h) | ||
675 | { | ||
676 | int next_nid; | ||
677 | next_nid = next_node(h->next_nid_to_free, node_online_map); | ||
678 | if (next_nid == MAX_NUMNODES) | ||
679 | next_nid = first_node(node_online_map); | ||
680 | h->next_nid_to_free = next_nid; | ||
681 | return next_nid; | ||
682 | } | ||
683 | |||
684 | /* | ||
685 | * Free huge page from pool from next node to free. | ||
686 | * Attempt to keep persistent huge pages more or less | ||
687 | * balanced over allowed nodes. | ||
688 | * Called with hugetlb_lock locked. | ||
689 | */ | ||
690 | static int free_pool_huge_page(struct hstate *h, bool acct_surplus) | ||
691 | { | ||
692 | int start_nid; | ||
693 | int next_nid; | ||
694 | int ret = 0; | ||
695 | |||
696 | start_nid = h->next_nid_to_free; | ||
697 | next_nid = start_nid; | ||
698 | |||
699 | do { | ||
700 | /* | ||
701 | * If we're returning unused surplus pages, only examine | ||
702 | * nodes with surplus pages. | ||
703 | */ | ||
704 | if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && | ||
705 | !list_empty(&h->hugepage_freelists[next_nid])) { | ||
706 | struct page *page = | ||
707 | list_entry(h->hugepage_freelists[next_nid].next, | ||
708 | struct page, lru); | ||
709 | list_del(&page->lru); | ||
710 | h->free_huge_pages--; | ||
711 | h->free_huge_pages_node[next_nid]--; | ||
712 | if (acct_surplus) { | ||
713 | h->surplus_huge_pages--; | ||
714 | h->surplus_huge_pages_node[next_nid]--; | ||
715 | } | ||
716 | update_and_free_page(h, page); | ||
717 | ret = 1; | ||
718 | } | ||
719 | next_nid = hstate_next_node_to_free(h); | ||
720 | } while (!ret && next_nid != start_nid); | ||
721 | |||
722 | return ret; | ||
723 | } | ||
724 | |||
686 | static struct page *alloc_buddy_huge_page(struct hstate *h, | 725 | static struct page *alloc_buddy_huge_page(struct hstate *h, |
687 | struct vm_area_struct *vma, unsigned long address) | 726 | struct vm_area_struct *vma, unsigned long address) |
688 | { | 727 | { |
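free_pool_huge_page() walks its own cursor the same way, but adds an eligibility test: when it is called to return unused surplus pages (acct_surplus true) it skips nodes with no surplus to account, and in either case the node needs a non-empty free list. A small hypothetical C sketch of just that predicate (the array names and contents are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 4

/*
 * Invented per-node counters standing in for h->surplus_huge_pages_node[]
 * and the per-node huge page free lists.
 */
static int surplus_on_node[MAX_NODES] = { 0, 2, 0, 1 };
static int free_on_node[MAX_NODES]    = { 3, 2, 0, 1 };

/*
 * Mirrors the check in free_pool_huge_page(): the node must have a free
 * huge page, and when only surplus pages may be freed it must also have
 * surplus pages to account against.
 */
static bool can_free_from_node(int nid, bool acct_surplus)
{
	if (acct_surplus && surplus_on_node[nid] == 0)
		return false;
	return free_on_node[nid] > 0;
}

int main(void)
{
	for (int nid = 0; nid < MAX_NODES; nid++)
		printf("node %d: pool shrink %s, surplus return %s\n", nid,
		       can_free_from_node(nid, false) ? "ok" : "skip",
		       can_free_from_node(nid, true)  ? "ok" : "skip");
	return 0;
}

Both shrink paths later in the patch (return_unused_surplus_pages() and set_max_huge_pages()) go through this one helper, so frees stay spread across online nodes just like allocations.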
@@ -854,22 +893,13 @@ free: | |||
854 | * When releasing a hugetlb pool reservation, any surplus pages that were | 893 | * When releasing a hugetlb pool reservation, any surplus pages that were |
855 | * allocated to satisfy the reservation must be explicitly freed if they were | 894 | * allocated to satisfy the reservation must be explicitly freed if they were |
856 | * never used. | 895 | * never used. |
896 | * Called with hugetlb_lock held. | ||
857 | */ | 897 | */ |
858 | static void return_unused_surplus_pages(struct hstate *h, | 898 | static void return_unused_surplus_pages(struct hstate *h, |
859 | unsigned long unused_resv_pages) | 899 | unsigned long unused_resv_pages) |
860 | { | 900 | { |
861 | static int nid = -1; | ||
862 | struct page *page; | ||
863 | unsigned long nr_pages; | 901 | unsigned long nr_pages; |
864 | 902 | ||
865 | /* | ||
866 | * We want to release as many surplus pages as possible, spread | ||
867 | * evenly across all nodes. Iterate across all nodes until we | ||
868 | * can no longer free unreserved surplus pages. This occurs when | ||
869 | * the nodes with surplus pages have no free pages. | ||
870 | */ | ||
871 | unsigned long remaining_iterations = nr_online_nodes; | ||
872 | |||
873 | /* Uncommit the reservation */ | 903 | /* Uncommit the reservation */ |
874 | h->resv_huge_pages -= unused_resv_pages; | 904 | h->resv_huge_pages -= unused_resv_pages; |
875 | 905 | ||
@@ -879,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
879 | 909 | ||
880 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); | 910 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); |
881 | 911 | ||
882 | while (remaining_iterations-- && nr_pages) { | 912 | /* |
883 | nid = next_node(nid, node_online_map); | 913 | * We want to release as many surplus pages as possible, spread |
884 | if (nid == MAX_NUMNODES) | 914 | * evenly across all nodes. Iterate across all nodes until we |
885 | nid = first_node(node_online_map); | 915 | * can no longer free unreserved surplus pages. This occurs when |
886 | 916 | * the nodes with surplus pages have no free pages. | |
887 | if (!h->surplus_huge_pages_node[nid]) | 917 | * free_pool_huge_page() will balance the frees across the |
888 | continue; | 918 | * on-line nodes for us and will handle the hstate accounting. |
889 | 919 | */ | |
890 | if (!list_empty(&h->hugepage_freelists[nid])) { | 920 | while (nr_pages--) { |
891 | page = list_entry(h->hugepage_freelists[nid].next, | 921 | if (!free_pool_huge_page(h, 1)) |
892 | struct page, lru); | 922 | break; |
893 | list_del(&page->lru); | ||
894 | update_and_free_page(h, page); | ||
895 | h->free_huge_pages--; | ||
896 | h->free_huge_pages_node[nid]--; | ||
897 | h->surplus_huge_pages--; | ||
898 | h->surplus_huge_pages_node[nid]--; | ||
899 | nr_pages--; | ||
900 | remaining_iterations = nr_online_nodes; | ||
901 | } | ||
902 | } | 923 | } |
903 | } | 924 | } |
904 | 925 | ||
@@ -1007,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) | |||
1007 | void *addr; | 1028 | void *addr; |
1008 | 1029 | ||
1009 | addr = __alloc_bootmem_node_nopanic( | 1030 | addr = __alloc_bootmem_node_nopanic( |
1010 | NODE_DATA(h->hugetlb_next_nid), | 1031 | NODE_DATA(h->next_nid_to_alloc), |
1011 | huge_page_size(h), huge_page_size(h), 0); | 1032 | huge_page_size(h), huge_page_size(h), 0); |
1012 | 1033 | ||
1034 | hstate_next_node_to_alloc(h); | ||
1013 | if (addr) { | 1035 | if (addr) { |
1014 | /* | 1036 | /* |
1015 | * Use the beginning of the huge page to store the | 1037 | * Use the beginning of the huge page to store the |
@@ -1019,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) | |||
1019 | m = addr; | 1041 | m = addr; |
1020 | goto found; | 1042 | goto found; |
1021 | } | 1043 | } |
1022 | hstate_next_node(h); | ||
1023 | nr_nodes--; | 1044 | nr_nodes--; |
1024 | } | 1045 | } |
1025 | return 0; | 1046 | return 0; |
@@ -1140,31 +1161,43 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) | |||
1140 | */ | 1161 | */ |
1141 | static int adjust_pool_surplus(struct hstate *h, int delta) | 1162 | static int adjust_pool_surplus(struct hstate *h, int delta) |
1142 | { | 1163 | { |
1143 | static int prev_nid; | 1164 | int start_nid, next_nid; |
1144 | int nid = prev_nid; | ||
1145 | int ret = 0; | 1165 | int ret = 0; |
1146 | 1166 | ||
1147 | VM_BUG_ON(delta != -1 && delta != 1); | 1167 | VM_BUG_ON(delta != -1 && delta != 1); |
1148 | do { | ||
1149 | nid = next_node(nid, node_online_map); | ||
1150 | if (nid == MAX_NUMNODES) | ||
1151 | nid = first_node(node_online_map); | ||
1152 | 1168 | ||
1153 | /* To shrink on this node, there must be a surplus page */ | 1169 | if (delta < 0) |
1154 | if (delta < 0 && !h->surplus_huge_pages_node[nid]) | 1170 | start_nid = h->next_nid_to_alloc; |
1155 | continue; | 1171 | else |
1156 | /* Surplus cannot exceed the total number of pages */ | 1172 | start_nid = h->next_nid_to_free; |
1157 | if (delta > 0 && h->surplus_huge_pages_node[nid] >= | 1173 | next_nid = start_nid; |
1174 | |||
1175 | do { | ||
1176 | int nid = next_nid; | ||
1177 | if (delta < 0) { | ||
1178 | next_nid = hstate_next_node_to_alloc(h); | ||
1179 | /* | ||
1180 | * To shrink on this node, there must be a surplus page | ||
1181 | */ | ||
1182 | if (!h->surplus_huge_pages_node[nid]) | ||
1183 | continue; | ||
1184 | } | ||
1185 | if (delta > 0) { | ||
1186 | next_nid = hstate_next_node_to_free(h); | ||
1187 | /* | ||
1188 | * Surplus cannot exceed the total number of pages | ||
1189 | */ | ||
1190 | if (h->surplus_huge_pages_node[nid] >= | ||
1158 | h->nr_huge_pages_node[nid]) | 1191 | h->nr_huge_pages_node[nid]) |
1159 | continue; | 1192 | continue; |
1193 | } | ||
1160 | 1194 | ||
1161 | h->surplus_huge_pages += delta; | 1195 | h->surplus_huge_pages += delta; |
1162 | h->surplus_huge_pages_node[nid] += delta; | 1196 | h->surplus_huge_pages_node[nid] += delta; |
1163 | ret = 1; | 1197 | ret = 1; |
1164 | break; | 1198 | break; |
1165 | } while (nid != prev_nid); | 1199 | } while (next_nid != start_nid); |
1166 | 1200 | ||
1167 | prev_nid = nid; | ||
1168 | return ret; | 1201 | return ret; |
1169 | } | 1202 | } |
1170 | 1203 | ||
@@ -1226,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
1226 | min_count = max(count, min_count); | 1259 | min_count = max(count, min_count); |
1227 | try_to_free_low(h, min_count); | 1260 | try_to_free_low(h, min_count); |
1228 | while (min_count < persistent_huge_pages(h)) { | 1261 | while (min_count < persistent_huge_pages(h)) { |
1229 | struct page *page = dequeue_huge_page(h); | 1262 | if (!free_pool_huge_page(h, 0)) |
1230 | if (!page) | ||
1231 | break; | 1263 | break; |
1232 | update_and_free_page(h, page); | ||
1233 | } | 1264 | } |
1234 | while (count < persistent_huge_pages(h)) { | 1265 | while (count < persistent_huge_pages(h)) { |
1235 | if (!adjust_pool_surplus(h, 1)) | 1266 | if (!adjust_pool_surplus(h, 1)) |
@@ -1441,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1441 | h->free_huge_pages = 0; | 1472 | h->free_huge_pages = 0; |
1442 | for (i = 0; i < MAX_NUMNODES; ++i) | 1473 | for (i = 0; i < MAX_NUMNODES; ++i) |
1443 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1474 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
1444 | h->hugetlb_next_nid = first_node(node_online_map); | 1475 | h->next_nid_to_alloc = first_node(node_online_map); |
1476 | h->next_nid_to_free = first_node(node_online_map); | ||
1445 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1477 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
1446 | huge_page_size(h)/1024); | 1478 | huge_page_size(h)/1024); |
1447 | 1479 | ||
@@ -1505,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) | |||
1505 | 1537 | ||
1506 | #ifdef CONFIG_SYSCTL | 1538 | #ifdef CONFIG_SYSCTL |
1507 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1539 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, |
1508 | struct file *file, void __user *buffer, | 1540 | void __user *buffer, |
1509 | size_t *length, loff_t *ppos) | 1541 | size_t *length, loff_t *ppos) |
1510 | { | 1542 | { |
1511 | struct hstate *h = &default_hstate; | 1543 | struct hstate *h = &default_hstate; |
@@ -1516,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, | |||
1516 | 1548 | ||
1517 | table->data = &tmp; | 1549 | table->data = &tmp; |
1518 | table->maxlen = sizeof(unsigned long); | 1550 | table->maxlen = sizeof(unsigned long); |
1519 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | 1551 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
1520 | 1552 | ||
1521 | if (write) | 1553 | if (write) |
1522 | h->max_huge_pages = set_max_huge_pages(h, tmp); | 1554 | h->max_huge_pages = set_max_huge_pages(h, tmp); |
@@ -1525,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, | |||
1525 | } | 1557 | } |
1526 | 1558 | ||
1527 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, | 1559 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, |
1528 | struct file *file, void __user *buffer, | 1560 | void __user *buffer, |
1529 | size_t *length, loff_t *ppos) | 1561 | size_t *length, loff_t *ppos) |
1530 | { | 1562 | { |
1531 | proc_dointvec(table, write, file, buffer, length, ppos); | 1563 | proc_dointvec(table, write, buffer, length, ppos); |
1532 | if (hugepages_treat_as_movable) | 1564 | if (hugepages_treat_as_movable) |
1533 | htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; | 1565 | htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; |
1534 | else | 1566 | else |
@@ -1537,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write, | |||
1537 | } | 1569 | } |
1538 | 1570 | ||
1539 | int hugetlb_overcommit_handler(struct ctl_table *table, int write, | 1571 | int hugetlb_overcommit_handler(struct ctl_table *table, int write, |
1540 | struct file *file, void __user *buffer, | 1572 | void __user *buffer, |
1541 | size_t *length, loff_t *ppos) | 1573 | size_t *length, loff_t *ppos) |
1542 | { | 1574 | { |
1543 | struct hstate *h = &default_hstate; | 1575 | struct hstate *h = &default_hstate; |
@@ -1548,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
1548 | 1580 | ||
1549 | table->data = &tmp; | 1581 | table->data = &tmp; |
1550 | table->maxlen = sizeof(unsigned long); | 1582 | table->maxlen = sizeof(unsigned long); |
1551 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | 1583 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
1552 | 1584 | ||
1553 | if (write) { | 1585 | if (write) { |
1554 | spin_lock(&hugetlb_lock); | 1586 | spin_lock(&hugetlb_lock); |
@@ -1689,7 +1721,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1689 | return 0; | 1721 | return 0; |
1690 | } | 1722 | } |
1691 | 1723 | ||
1692 | struct vm_operations_struct hugetlb_vm_ops = { | 1724 | const struct vm_operations_struct hugetlb_vm_ops = { |
1693 | .fault = hugetlb_vm_op_fault, | 1725 | .fault = hugetlb_vm_op_fault, |
1694 | .open = hugetlb_vm_op_open, | 1726 | .open = hugetlb_vm_op_open, |
1695 | .close = hugetlb_vm_op_close, | 1727 | .close = hugetlb_vm_op_close, |
@@ -1984,6 +2016,26 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h, | |||
1984 | return find_lock_page(mapping, idx); | 2016 | return find_lock_page(mapping, idx); |
1985 | } | 2017 | } |
1986 | 2018 | ||
2019 | /* | ||
2020 | * Return whether there is a pagecache page to back given address within VMA. | ||
2021 | * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. | ||
2022 | */ | ||
2023 | static bool hugetlbfs_pagecache_present(struct hstate *h, | ||
2024 | struct vm_area_struct *vma, unsigned long address) | ||
2025 | { | ||
2026 | struct address_space *mapping; | ||
2027 | pgoff_t idx; | ||
2028 | struct page *page; | ||
2029 | |||
2030 | mapping = vma->vm_file->f_mapping; | ||
2031 | idx = vma_hugecache_offset(h, vma, address); | ||
2032 | |||
2033 | page = find_get_page(mapping, idx); | ||
2034 | if (page) | ||
2035 | put_page(page); | ||
2036 | return page != NULL; | ||
2037 | } | ||
2038 | |||
1987 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2039 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1988 | unsigned long address, pte_t *ptep, unsigned int flags) | 2040 | unsigned long address, pte_t *ptep, unsigned int flags) |
1989 | { | 2041 | { |
@@ -2179,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, | |||
2179 | return NULL; | 2231 | return NULL; |
2180 | } | 2232 | } |
2181 | 2233 | ||
2182 | static int huge_zeropage_ok(pte_t *ptep, int write, int shared) | ||
2183 | { | ||
2184 | if (!ptep || write || shared) | ||
2185 | return 0; | ||
2186 | else | ||
2187 | return huge_pte_none(huge_ptep_get(ptep)); | ||
2188 | } | ||
2189 | |||
2190 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2234 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, |
2191 | struct page **pages, struct vm_area_struct **vmas, | 2235 | struct page **pages, struct vm_area_struct **vmas, |
2192 | unsigned long *position, int *length, int i, | 2236 | unsigned long *position, int *length, int i, |
2193 | int write) | 2237 | unsigned int flags) |
2194 | { | 2238 | { |
2195 | unsigned long pfn_offset; | 2239 | unsigned long pfn_offset; |
2196 | unsigned long vaddr = *position; | 2240 | unsigned long vaddr = *position; |
2197 | int remainder = *length; | 2241 | int remainder = *length; |
2198 | struct hstate *h = hstate_vma(vma); | 2242 | struct hstate *h = hstate_vma(vma); |
2199 | int zeropage_ok = 0; | ||
2200 | int shared = vma->vm_flags & VM_SHARED; | ||
2201 | 2243 | ||
2202 | spin_lock(&mm->page_table_lock); | 2244 | spin_lock(&mm->page_table_lock); |
2203 | while (vaddr < vma->vm_end && remainder) { | 2245 | while (vaddr < vma->vm_end && remainder) { |
2204 | pte_t *pte; | 2246 | pte_t *pte; |
2247 | int absent; | ||
2205 | struct page *page; | 2248 | struct page *page; |
2206 | 2249 | ||
2207 | /* | 2250 | /* |
2208 | * Some archs (sparc64, sh*) have multiple pte_ts to | 2251 | * Some archs (sparc64, sh*) have multiple pte_ts to |
2209 | * each hugepage. We have to make * sure we get the | 2252 | * each hugepage. We have to make sure we get the |
2210 | * first, for the page indexing below to work. | 2253 | * first, for the page indexing below to work. |
2211 | */ | 2254 | */ |
2212 | pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); | 2255 | pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); |
2213 | if (huge_zeropage_ok(pte, write, shared)) | 2256 | absent = !pte || huge_pte_none(huge_ptep_get(pte)); |
2214 | zeropage_ok = 1; | 2257 | |
2258 | /* | ||
2259 | * When coredumping, it suits get_dump_page if we just return | ||
2260 | * an error where there's an empty slot with no huge pagecache | ||
2261 | * to back it. This way, we avoid allocating a hugepage, and | ||
2262 | * the sparse dumpfile avoids allocating disk blocks, but its | ||
2263 | * huge holes still show up with zeroes where they need to be. | ||
2264 | */ | ||
2265 | if (absent && (flags & FOLL_DUMP) && | ||
2266 | !hugetlbfs_pagecache_present(h, vma, vaddr)) { | ||
2267 | remainder = 0; | ||
2268 | break; | ||
2269 | } | ||
2215 | 2270 | ||
2216 | if (!pte || | 2271 | if (absent || |
2217 | (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || | 2272 | ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { |
2218 | (write && !pte_write(huge_ptep_get(pte)))) { | ||
2219 | int ret; | 2273 | int ret; |
2220 | 2274 | ||
2221 | spin_unlock(&mm->page_table_lock); | 2275 | spin_unlock(&mm->page_table_lock); |
2222 | ret = hugetlb_fault(mm, vma, vaddr, write); | 2276 | ret = hugetlb_fault(mm, vma, vaddr, |
2277 | (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); | ||
2223 | spin_lock(&mm->page_table_lock); | 2278 | spin_lock(&mm->page_table_lock); |
2224 | if (!(ret & VM_FAULT_ERROR)) | 2279 | if (!(ret & VM_FAULT_ERROR)) |
2225 | continue; | 2280 | continue; |
2226 | 2281 | ||
2227 | remainder = 0; | 2282 | remainder = 0; |
2228 | if (!i) | ||
2229 | i = -EFAULT; | ||
2230 | break; | 2283 | break; |
2231 | } | 2284 | } |
2232 | 2285 | ||
@@ -2234,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2234 | page = pte_page(huge_ptep_get(pte)); | 2287 | page = pte_page(huge_ptep_get(pte)); |
2235 | same_page: | 2288 | same_page: |
2236 | if (pages) { | 2289 | if (pages) { |
2237 | if (zeropage_ok) | 2290 | pages[i] = mem_map_offset(page, pfn_offset); |
2238 | pages[i] = ZERO_PAGE(0); | ||
2239 | else | ||
2240 | pages[i] = mem_map_offset(page, pfn_offset); | ||
2241 | get_page(pages[i]); | 2291 | get_page(pages[i]); |
2242 | } | 2292 | } |
2243 | 2293 | ||
@@ -2261,7 +2311,7 @@ same_page: | |||
2261 | *length = remainder; | 2311 | *length = remainder; |
2262 | *position = vaddr; | 2312 | *position = vaddr; |
2263 | 2313 | ||
2264 | return i; | 2314 | return i ? i : -EFAULT; |
2265 | } | 2315 | } |
2266 | 2316 | ||
2267 | void hugetlb_change_protection(struct vm_area_struct *vma, | 2317 | void hugetlb_change_protection(struct vm_area_struct *vma, |