Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--   mm/hugetlb.c   251
1 file changed, 150 insertions, 101 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b16d63634777..815dbd4a6dcb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -456,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
 	h->free_huge_pages_node[nid]++;
 }
 
-static struct page *dequeue_huge_page(struct hstate *h)
-{
-	int nid;
-	struct page *page = NULL;
-
-	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
-		if (!list_empty(&h->hugepage_freelists[nid])) {
-			page = list_entry(h->hugepage_freelists[nid].next,
-					  struct page, lru);
-			list_del(&page->lru);
-			h->free_huge_pages--;
-			h->free_huge_pages_node[nid]--;
-			break;
-		}
-	}
-	return page;
-}
-
 static struct page *dequeue_huge_page_vma(struct hstate *h,
 			struct vm_area_struct *vma,
 			unsigned long address, int avoid_reserve)
@@ -641,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 
 /*
  * Use a helper variable to find the next node and then
- * copy it back to hugetlb_next_nid afterwards:
+ * copy it back to next_nid_to_alloc afterwards:
  * otherwise there's a window in which a racer might
  * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
  * But we don't need to use a spin_lock here: it really
@@ -650,13 +632,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
  * if we just successfully allocated a hugepage so that
  * the next caller gets hugepages on the next node.
  */
-static int hstate_next_node(struct hstate *h)
+static int hstate_next_node_to_alloc(struct hstate *h)
 {
 	int next_nid;
-	next_nid = next_node(h->hugetlb_next_nid, node_online_map);
+	next_nid = next_node(h->next_nid_to_alloc, node_online_map);
 	if (next_nid == MAX_NUMNODES)
 		next_nid = first_node(node_online_map);
-	h->hugetlb_next_nid = next_nid;
+	h->next_nid_to_alloc = next_nid;
 	return next_nid;
 }
 
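For illustration, the renamed helper is nothing more than a round-robin cursor over the online nodes that wraps when next_node() runs past MAX_NUMNODES. A minimal user-space sketch of the same pattern, assuming a fixed node count and plain ints instead of nodemask_t (the names are illustrative, not kernel API):

#include <stdio.h>

#define NR_NODES 4                      /* pretend every node is online */

static int next_nid;                    /* analogue of h->next_nid_to_alloc */

/* Advance the round-robin cursor, wrapping past the last node. */
static int next_node_to_alloc(void)
{
        next_nid = (next_nid + 1) % NR_NODES;
        return next_nid;
}

int main(void)
{
        int i;

        /* Successive allocations land on nodes 0,1,2,3,0,1,2,3,... */
        for (i = 0; i < 2 * NR_NODES; i++) {
                printf("allocate on node %d\n", next_nid);
                next_node_to_alloc();
        }
        return 0;
}

alloc_fresh_huge_page() in the next hunk drives the cursor exactly this way: use the current value, then advance, so consecutive allocations interleave across nodes.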
@@ -667,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h)
 	int next_nid;
 	int ret = 0;
 
-	start_nid = h->hugetlb_next_nid;
+	start_nid = h->next_nid_to_alloc;
+	next_nid = start_nid;
 
 	do {
-		page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
+		page = alloc_fresh_huge_page_node(h, next_nid);
 		if (page)
 			ret = 1;
-		next_nid = hstate_next_node(h);
-	} while (!page && h->hugetlb_next_nid != start_nid);
+		next_nid = hstate_next_node_to_alloc(h);
+	} while (!page && next_nid != start_nid);
 
 	if (ret)
 		count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -684,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h)
 	return ret;
 }
 
+/*
+ * helper for free_pool_huge_page() - find next node
+ * from which to free a huge page
+ */
+static int hstate_next_node_to_free(struct hstate *h)
+{
+	int next_nid;
+	next_nid = next_node(h->next_nid_to_free, node_online_map);
+	if (next_nid == MAX_NUMNODES)
+		next_nid = first_node(node_online_map);
+	h->next_nid_to_free = next_nid;
+	return next_nid;
+}
+
+/*
+ * Free huge page from pool from next node to free.
+ * Attempt to keep persistent huge pages more or less
+ * balanced over allowed nodes.
+ * Called with hugetlb_lock locked.
+ */
+static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
+{
+	int start_nid;
+	int next_nid;
+	int ret = 0;
+
+	start_nid = h->next_nid_to_free;
+	next_nid = start_nid;
+
+	do {
+		/*
+		 * If we're returning unused surplus pages, only examine
+		 * nodes with surplus pages.
+		 */
+		if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
+		    !list_empty(&h->hugepage_freelists[next_nid])) {
+			struct page *page =
+				list_entry(h->hugepage_freelists[next_nid].next,
+					  struct page, lru);
+			list_del(&page->lru);
+			h->free_huge_pages--;
+			h->free_huge_pages_node[next_nid]--;
+			if (acct_surplus) {
+				h->surplus_huge_pages--;
+				h->surplus_huge_pages_node[next_nid]--;
+			}
+			update_and_free_page(h, page);
+			ret = 1;
+		}
+		next_nid = hstate_next_node_to_free(h);
+	} while (!ret && next_nid != start_nid);
+
+	return ret;
+}
+
 static struct page *alloc_buddy_huge_page(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long address)
 {
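The new free_pool_huge_page() walks at most one full cycle of nodes starting at the saved cursor, frees the first eligible page it finds, and advances the cursor even on success so the next call starts on a different node. A self-contained user-space sketch of that loop shape, with per-node counters standing in for the free lists (illustrative names, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

static int free_pages_node[NR_NODES] = { 0, 3, 0, 1 };
static int next_nid_to_free;            /* analogue of h->next_nid_to_free */

static int next_node_to_free(void)
{
        next_nid_to_free = (next_nid_to_free + 1) % NR_NODES;
        return next_nid_to_free;
}

/* Free one page from the next node that has any, or report failure. */
static bool free_pool_page(void)
{
        int start_nid = next_nid_to_free;
        int nid = start_nid;
        bool freed = false;

        do {
                if (free_pages_node[nid] > 0) {
                        free_pages_node[nid]--;
                        printf("freed a page from node %d\n", nid);
                        freed = true;
                }
                nid = next_node_to_free();      /* advance even after success */
        } while (!freed && nid != start_nid);

        return freed;   /* false only after a full cycle found nothing */
}

int main(void)
{
        while (free_pool_page())
                ;       /* keeps draining until every node's list is empty */
        return 0;
}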
@@ -855,22 +893,13 @@ free:
  * When releasing a hugetlb pool reservation, any surplus pages that were
  * allocated to satisfy the reservation must be explicitly freed if they were
  * never used.
+ * Called with hugetlb_lock held.
  */
 static void return_unused_surplus_pages(struct hstate *h,
 					unsigned long unused_resv_pages)
 {
-	static int nid = -1;
-	struct page *page;
 	unsigned long nr_pages;
 
-	/*
-	 * We want to release as many surplus pages as possible, spread
-	 * evenly across all nodes. Iterate across all nodes until we
-	 * can no longer free unreserved surplus pages. This occurs when
-	 * the nodes with surplus pages have no free pages.
-	 */
-	unsigned long remaining_iterations = nr_online_nodes;
-
 	/* Uncommit the reservation */
 	h->resv_huge_pages -= unused_resv_pages;
 
@@ -880,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h,
 
 	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
 
-	while (remaining_iterations-- && nr_pages) {
-		nid = next_node(nid, node_online_map);
-		if (nid == MAX_NUMNODES)
-			nid = first_node(node_online_map);
-
-		if (!h->surplus_huge_pages_node[nid])
-			continue;
-
-		if (!list_empty(&h->hugepage_freelists[nid])) {
-			page = list_entry(h->hugepage_freelists[nid].next,
-					  struct page, lru);
-			list_del(&page->lru);
-			update_and_free_page(h, page);
-			h->free_huge_pages--;
-			h->free_huge_pages_node[nid]--;
-			h->surplus_huge_pages--;
-			h->surplus_huge_pages_node[nid]--;
-			nr_pages--;
-			remaining_iterations = nr_online_nodes;
-		}
+	/*
+	 * We want to release as many surplus pages as possible, spread
+	 * evenly across all nodes. Iterate across all nodes until we
+	 * can no longer free unreserved surplus pages. This occurs when
+	 * the nodes with surplus pages have no free pages.
+	 * free_pool_huge_page() will balance the the frees across the
+	 * on-line nodes for us and will handle the hstate accounting.
+	 */
+	while (nr_pages--) {
+		if (!free_pool_huge_page(h, 1))
+			break;
 	}
 }
 
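With the loop rewritten this way, nr_pages only bounds the amount of work: for example, with three unused reservations but two surplus pages, nr_pages = min(3, 2) = 2, so at most two calls to free_pool_huge_page(h, 1) are made, and the loop also stops early as soon as a whole pass over the online nodes finds no surplus page left on any free list.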
@@ -1008,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 		void *addr;
 
 		addr = __alloc_bootmem_node_nopanic(
-				NODE_DATA(h->hugetlb_next_nid),
+				NODE_DATA(h->next_nid_to_alloc),
 				huge_page_size(h), huge_page_size(h), 0);
 
+		hstate_next_node_to_alloc(h);
 		if (addr) {
 			/*
 			 * Use the beginning of the huge page to store the
@@ -1020,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 			m = addr;
 			goto found;
 		}
-		hstate_next_node(h);
 		nr_nodes--;
 	}
 	return 0;
@@ -1141,31 +1161,43 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
  */
 static int adjust_pool_surplus(struct hstate *h, int delta)
 {
-	static int prev_nid;
-	int nid = prev_nid;
+	int start_nid, next_nid;
 	int ret = 0;
 
 	VM_BUG_ON(delta != -1 && delta != 1);
-	do {
-		nid = next_node(nid, node_online_map);
-		if (nid == MAX_NUMNODES)
-			nid = first_node(node_online_map);
 
-		/* To shrink on this node, there must be a surplus page */
-		if (delta < 0 && !h->surplus_huge_pages_node[nid])
-			continue;
-		/* Surplus cannot exceed the total number of pages */
-		if (delta > 0 && h->surplus_huge_pages_node[nid] >=
+	if (delta < 0)
+		start_nid = h->next_nid_to_alloc;
+	else
+		start_nid = h->next_nid_to_free;
+	next_nid = start_nid;
+
+	do {
+		int nid = next_nid;
+		if (delta < 0) {
+			next_nid = hstate_next_node_to_alloc(h);
+			/*
+			 * To shrink on this node, there must be a surplus page
+			 */
+			if (!h->surplus_huge_pages_node[nid])
+				continue;
+		}
+		if (delta > 0) {
+			next_nid = hstate_next_node_to_free(h);
+			/*
+			 * Surplus cannot exceed the total number of pages
+			 */
+			if (h->surplus_huge_pages_node[nid] >=
 						h->nr_huge_pages_node[nid])
-			continue;
+				continue;
+		}
 
 		h->surplus_huge_pages += delta;
 		h->surplus_huge_pages_node[nid] += delta;
 		ret = 1;
 		break;
-	} while (nid != prev_nid);
+	} while (next_nid != start_nid);
 
-	prev_nid = nid;
 	return ret;
 }
 
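Note the asymmetry in the rewritten adjust_pool_surplus(): delta < 0 (converting surplus pages into persistent ones) walks nodes with the allocation cursor, while delta > 0 (converting persistent pages into surplus) walks with the freeing cursor, presumably so that each conversion advances the same round-robin state as the allocation or free it stands in for, keeping the per-node distribution balanced.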
@@ -1227,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 	min_count = max(count, min_count);
 	try_to_free_low(h, min_count);
 	while (min_count < persistent_huge_pages(h)) {
-		struct page *page = dequeue_huge_page(h);
-		if (!page)
+		if (!free_pool_huge_page(h, 0))
 			break;
-		update_and_free_page(h, page);
 	}
 	while (count < persistent_huge_pages(h)) {
 		if (!adjust_pool_surplus(h, 1))
@@ -1442,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order)
 	h->free_huge_pages = 0;
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
-	h->hugetlb_next_nid = first_node(node_online_map);
+	h->next_nid_to_alloc = first_node(node_online_map);
+	h->next_nid_to_free = first_node(node_online_map);
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
 
@@ -1985,6 +2016,26 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h,
 	return find_lock_page(mapping, idx);
 }
 
+/*
+ * Return whether there is a pagecache page to back given address within VMA.
+ * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
+ */
+static bool hugetlbfs_pagecache_present(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	struct address_space *mapping;
+	pgoff_t idx;
+	struct page *page;
+
+	mapping = vma->vm_file->f_mapping;
+	idx = vma_hugecache_offset(h, vma, address);
+
+	page = find_get_page(mapping, idx);
+	if (page)
+		put_page(page);
+	return page != NULL;
+}
+
 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, unsigned int flags)
 {
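The new helper deliberately uses find_get_page() rather than the find_lock_page() used by hugetlbfs_pagecache_page() just above: its caller holds mm->page_table_lock, a spinlock, and lock_page() can sleep, so all the helper may do is take a reference long enough to learn whether a page exists and then drop it again with put_page().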
@@ -2180,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
-static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
-{
-	if (!ptep || write || shared)
-		return 0;
-	else
-		return huge_pte_none(huge_ptep_get(ptep));
-}
-
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			struct page **pages, struct vm_area_struct **vmas,
 			unsigned long *position, int *length, int i,
-			int write)
+			unsigned int flags)
 {
 	unsigned long pfn_offset;
 	unsigned long vaddr = *position;
 	int remainder = *length;
 	struct hstate *h = hstate_vma(vma);
-	int zeropage_ok = 0;
-	int shared = vma->vm_flags & VM_SHARED;
 
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
+		int absent;
 		struct page *page;
 
 		/*
 		 * Some archs (sparc64, sh*) have multiple pte_ts to
-		 * each hugepage. We have to make * sure we get the
+		 * each hugepage. We have to make sure we get the
 		 * first, for the page indexing below to work.
 		 */
 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
-		if (huge_zeropage_ok(pte, write, shared))
-			zeropage_ok = 1;
+		absent = !pte || huge_pte_none(huge_ptep_get(pte));
+
+		/*
+		 * When coredumping, it suits get_dump_page if we just return
+		 * an error where there's an empty slot with no huge pagecache
+		 * to back it. This way, we avoid allocating a hugepage, and
+		 * the sparse dumpfile avoids allocating disk blocks, but its
+		 * huge holes still show up with zeroes where they need to be.
+		 */
+		if (absent && (flags & FOLL_DUMP) &&
+		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
+			remainder = 0;
+			break;
+		}
 
-		if (!pte ||
-		    (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
-		    (write && !pte_write(huge_ptep_get(pte)))) {
+		if (absent ||
+		    ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
 			int ret;
 
 			spin_unlock(&mm->page_table_lock);
-			ret = hugetlb_fault(mm, vma, vaddr, write);
+			ret = hugetlb_fault(mm, vma, vaddr,
+				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
 			spin_lock(&mm->page_table_lock);
 			if (!(ret & VM_FAULT_ERROR))
 				continue;
 
 			remainder = 0;
-			if (!i)
-				i = -EFAULT;
 			break;
 		}
 
@@ -2235,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page = pte_page(huge_ptep_get(pte));
 same_page:
 		if (pages) {
-			if (zeropage_ok)
-				pages[i] = ZERO_PAGE(0);
-			else
-				pages[i] = mem_map_offset(page, pfn_offset);
+			pages[i] = mem_map_offset(page, pfn_offset);
 			get_page(pages[i]);
 		}
 
@@ -2262,7 +2311,7 @@ same_page:
 	*length = remainder;
 	*position = vaddr;
 
-	return i;
+	return i ? i : -EFAULT;
 }
 
 void hugetlb_change_protection(struct vm_area_struct *vma,
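The last two hunks also move the error handling to a single exit point: rather than turning i into -EFAULT inside the loop, follow_hugetlb_page() now returns i ? i : -EFAULT, i.e. the count of pages gathered so far when any progress was made, and -EFAULT only when there was none (in the kernel, i can already be non-zero on entry; the sketch below simplifies that away). A small user-space illustration of that partial-success convention, with made-up data and names:

#include <errno.h>
#include <stdio.h>

/*
 * Copy up to n "slots" into out[], stopping at the first bad slot.
 * Mirrors the i ? i : -EFAULT convention: partial progress is reported
 * as a positive count, total failure as -EFAULT.
 */
static int fill_slots(const int *src, int *out, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                if (src[i] < 0)         /* pretend a negative value is a hole */
                        break;
                out[i] = src[i];
        }
        return i ? i : -EFAULT;
}

int main(void)
{
        int all_good[] = { 1, 2, 3 };
        int partial[]  = { 1, -1, 3 };
        int none[]     = { -1, -1, -1 };
        int out[3];

        printf("%d\n", fill_slots(all_good, out, 3));   /* 3 */
        printf("%d\n", fill_slots(partial, out, 3));    /* 1: progress reported */
        printf("%d\n", fill_slots(none, out, 3));       /* -EFAULT: no progress */
        return 0;
}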