Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--  mm/hugetlb.c  |  251
1 file changed, 150 insertions, 101 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b16d63634777..815dbd4a6dcb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -456,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
 	h->free_huge_pages_node[nid]++;
 }
 
-static struct page *dequeue_huge_page(struct hstate *h)
-{
-	int nid;
-	struct page *page = NULL;
-
-	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
-		if (!list_empty(&h->hugepage_freelists[nid])) {
-			page = list_entry(h->hugepage_freelists[nid].next,
-					  struct page, lru);
-			list_del(&page->lru);
-			h->free_huge_pages--;
-			h->free_huge_pages_node[nid]--;
-			break;
-		}
-	}
-	return page;
-}
-
 static struct page *dequeue_huge_page_vma(struct hstate *h,
 				struct vm_area_struct *vma,
 				unsigned long address, int avoid_reserve)
@@ -641,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 
 /*
  * Use a helper variable to find the next node and then
- * copy it back to hugetlb_next_nid afterwards:
+ * copy it back to next_nid_to_alloc afterwards:
  * otherwise there's a window in which a racer might
  * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
  * But we don't need to use a spin_lock here: it really
@@ -650,13 +632,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
  * if we just successfully allocated a hugepage so that
  * the next caller gets hugepages on the next node.
  */
-static int hstate_next_node(struct hstate *h)
+static int hstate_next_node_to_alloc(struct hstate *h)
 {
 	int next_nid;
-	next_nid = next_node(h->hugetlb_next_nid, node_online_map);
+	next_nid = next_node(h->next_nid_to_alloc, node_online_map);
 	if (next_nid == MAX_NUMNODES)
 		next_nid = first_node(node_online_map);
-	h->hugetlb_next_nid = next_nid;
+	h->next_nid_to_alloc = next_nid;
 	return next_nid;
 }
 
@@ -667,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h)
 	int next_nid;
 	int ret = 0;
 
-	start_nid = h->hugetlb_next_nid;
+	start_nid = h->next_nid_to_alloc;
+	next_nid = start_nid;
 
 	do {
-		page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
+		page = alloc_fresh_huge_page_node(h, next_nid);
 		if (page)
 			ret = 1;
-		next_nid = hstate_next_node(h);
-	} while (!page && h->hugetlb_next_nid != start_nid);
+		next_nid = hstate_next_node_to_alloc(h);
+	} while (!page && next_nid != start_nid);
 
 	if (ret)
 		count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -684,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h)
 	return ret;
 }
 
+/*
+ * helper for free_pool_huge_page() - find next node
+ * from which to free a huge page
+ */
+static int hstate_next_node_to_free(struct hstate *h)
+{
+	int next_nid;
+	next_nid = next_node(h->next_nid_to_free, node_online_map);
+	if (next_nid == MAX_NUMNODES)
+		next_nid = first_node(node_online_map);
+	h->next_nid_to_free = next_nid;
+	return next_nid;
+}
+
+/*
+ * Free huge page from pool from next node to free.
+ * Attempt to keep persistent huge pages more or less
+ * balanced over allowed nodes.
+ * Called with hugetlb_lock locked.
+ */
+static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
+{
+	int start_nid;
+	int next_nid;
+	int ret = 0;
+
+	start_nid = h->next_nid_to_free;
+	next_nid = start_nid;
+
+	do {
+		/*
+		 * If we're returning unused surplus pages, only examine
+		 * nodes with surplus pages.
+		 */
+		if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
+		    !list_empty(&h->hugepage_freelists[next_nid])) {
+			struct page *page =
+				list_entry(h->hugepage_freelists[next_nid].next,
+					  struct page, lru);
+			list_del(&page->lru);
+			h->free_huge_pages--;
+			h->free_huge_pages_node[next_nid]--;
+			if (acct_surplus) {
+				h->surplus_huge_pages--;
+				h->surplus_huge_pages_node[next_nid]--;
+			}
+			update_and_free_page(h, page);
+			ret = 1;
+		}
+		next_nid = hstate_next_node_to_free(h);
+	} while (!ret && next_nid != start_nid);
+
+	return ret;
+}
+
 static struct page *alloc_buddy_huge_page(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long address)
 {
@@ -855,22 +893,13 @@ free:
  * When releasing a hugetlb pool reservation, any surplus pages that were
  * allocated to satisfy the reservation must be explicitly freed if they were
  * never used.
+ * Called with hugetlb_lock held.
  */
 static void return_unused_surplus_pages(struct hstate *h,
 					unsigned long unused_resv_pages)
 {
-	static int nid = -1;
-	struct page *page;
 	unsigned long nr_pages;
 
-	/*
-	 * We want to release as many surplus pages as possible, spread
-	 * evenly across all nodes. Iterate across all nodes until we
-	 * can no longer free unreserved surplus pages. This occurs when
-	 * the nodes with surplus pages have no free pages.
-	 */
-	unsigned long remaining_iterations = nr_online_nodes;
-
 	/* Uncommit the reservation */
 	h->resv_huge_pages -= unused_resv_pages;
 
@@ -880,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h,
 
 	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
 
-	while (remaining_iterations-- && nr_pages) {
-		nid = next_node(nid, node_online_map);
-		if (nid == MAX_NUMNODES)
-			nid = first_node(node_online_map);
-
-		if (!h->surplus_huge_pages_node[nid])
-			continue;
-
-		if (!list_empty(&h->hugepage_freelists[nid])) {
-			page = list_entry(h->hugepage_freelists[nid].next,
-					  struct page, lru);
-			list_del(&page->lru);
-			update_and_free_page(h, page);
-			h->free_huge_pages--;
-			h->free_huge_pages_node[nid]--;
-			h->surplus_huge_pages--;
-			h->surplus_huge_pages_node[nid]--;
-			nr_pages--;
-			remaining_iterations = nr_online_nodes;
-		}
+	/*
+	 * We want to release as many surplus pages as possible, spread
+	 * evenly across all nodes. Iterate across all nodes until we
+	 * can no longer free unreserved surplus pages. This occurs when
+	 * the nodes with surplus pages have no free pages.
+	 * free_pool_huge_page() will balance the frees across the
+	 * on-line nodes for us and will handle the hstate accounting.
+	 */
+	while (nr_pages--) {
+		if (!free_pool_huge_page(h, 1))
+			break;
 	}
 }
 
@@ -1008,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 		void *addr;
 
 		addr = __alloc_bootmem_node_nopanic(
-				NODE_DATA(h->hugetlb_next_nid),
+				NODE_DATA(h->next_nid_to_alloc),
 				huge_page_size(h), huge_page_size(h), 0);
 
+		hstate_next_node_to_alloc(h);
 		if (addr) {
 			/*
 			 * Use the beginning of the huge page to store the
@@ -1020,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 			m = addr;
 			goto found;
 		}
-		hstate_next_node(h);
 		nr_nodes--;
 	}
 	return 0;
@@ -1141,31 +1161,43 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
  */
 static int adjust_pool_surplus(struct hstate *h, int delta)
 {
-	static int prev_nid;
-	int nid = prev_nid;
+	int start_nid, next_nid;
 	int ret = 0;
 
 	VM_BUG_ON(delta != -1 && delta != 1);
-	do {
-		nid = next_node(nid, node_online_map);
-		if (nid == MAX_NUMNODES)
-			nid = first_node(node_online_map);
 
-		/* To shrink on this node, there must be a surplus page */
-		if (delta < 0 && !h->surplus_huge_pages_node[nid])
-			continue;
-		/* Surplus cannot exceed the total number of pages */
-		if (delta > 0 && h->surplus_huge_pages_node[nid] >=
+	if (delta < 0)
+		start_nid = h->next_nid_to_alloc;
+	else
+		start_nid = h->next_nid_to_free;
+	next_nid = start_nid;
+
+	do {
+		int nid = next_nid;
+		if (delta < 0) {
+			next_nid = hstate_next_node_to_alloc(h);
+			/*
+			 * To shrink on this node, there must be a surplus page
+			 */
+			if (!h->surplus_huge_pages_node[nid])
+				continue;
+		}
+		if (delta > 0) {
+			next_nid = hstate_next_node_to_free(h);
+			/*
+			 * Surplus cannot exceed the total number of pages
+			 */
+			if (h->surplus_huge_pages_node[nid] >=
 						h->nr_huge_pages_node[nid])
 			continue;
+		}
 
 		h->surplus_huge_pages += delta;
 		h->surplus_huge_pages_node[nid] += delta;
 		ret = 1;
 		break;
-	} while (nid != prev_nid);
+	} while (next_nid != start_nid);
 
-	prev_nid = nid;
 	return ret;
 }
 
@@ -1227,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 	min_count = max(count, min_count);
 	try_to_free_low(h, min_count);
 	while (min_count < persistent_huge_pages(h)) {
-		struct page *page = dequeue_huge_page(h);
-		if (!page)
+		if (!free_pool_huge_page(h, 0))
 			break;
-		update_and_free_page(h, page);
 	}
 	while (count < persistent_huge_pages(h)) {
 		if (!adjust_pool_surplus(h, 1))
@@ -1442,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order)
 	h->free_huge_pages = 0;
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
-	h->hugetlb_next_nid = first_node(node_online_map);
+	h->next_nid_to_alloc = first_node(node_online_map);
+	h->next_nid_to_free = first_node(node_online_map);
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
 
@@ -1985,6 +2016,26 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h,
 	return find_lock_page(mapping, idx);
 }
 
+/*
+ * Return whether there is a pagecache page to back given address within VMA.
+ * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
+ */
+static bool hugetlbfs_pagecache_present(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	struct address_space *mapping;
+	pgoff_t idx;
+	struct page *page;
+
+	mapping = vma->vm_file->f_mapping;
+	idx = vma_hugecache_offset(h, vma, address);
+
+	page = find_get_page(mapping, idx);
+	if (page)
+		put_page(page);
+	return page != NULL;
+}
+
 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, unsigned int flags)
 {
@@ -2180,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
-static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
-{
-	if (!ptep || write || shared)
-		return 0;
-	else
-		return huge_pte_none(huge_ptep_get(ptep));
-}
-
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			struct page **pages, struct vm_area_struct **vmas,
 			unsigned long *position, int *length, int i,
-			int write)
+			unsigned int flags)
 {
 	unsigned long pfn_offset;
 	unsigned long vaddr = *position;
 	int remainder = *length;
 	struct hstate *h = hstate_vma(vma);
-	int zeropage_ok = 0;
-	int shared = vma->vm_flags & VM_SHARED;
 
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
+		int absent;
 		struct page *page;
 
 		/*
 		 * Some archs (sparc64, sh*) have multiple pte_ts to
-		 * each hugepage. We have to make * sure we get the
+		 * each hugepage. We have to make sure we get the
 		 * first, for the page indexing below to work.
 		 */
 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
-		if (huge_zeropage_ok(pte, write, shared))
-			zeropage_ok = 1;
+		absent = !pte || huge_pte_none(huge_ptep_get(pte));
+
+		/*
+		 * When coredumping, it suits get_dump_page if we just return
+		 * an error where there's an empty slot with no huge pagecache
+		 * to back it. This way, we avoid allocating a hugepage, and
+		 * the sparse dumpfile avoids allocating disk blocks, but its
+		 * huge holes still show up with zeroes where they need to be.
+		 */
+		if (absent && (flags & FOLL_DUMP) &&
+		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
+			remainder = 0;
+			break;
+		}
 
-		if (!pte ||
-		    (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
-		    (write && !pte_write(huge_ptep_get(pte)))) {
+		if (absent ||
+		    ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
 			int ret;
 
 			spin_unlock(&mm->page_table_lock);
-			ret = hugetlb_fault(mm, vma, vaddr, write);
+			ret = hugetlb_fault(mm, vma, vaddr,
+				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
 			spin_lock(&mm->page_table_lock);
 			if (!(ret & VM_FAULT_ERROR))
 				continue;
 
 			remainder = 0;
-			if (!i)
-				i = -EFAULT;
 			break;
 		}
 
@@ -2235,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page = pte_page(huge_ptep_get(pte));
 same_page:
 		if (pages) {
-			if (zeropage_ok)
-				pages[i] = ZERO_PAGE(0);
-			else
-				pages[i] = mem_map_offset(page, pfn_offset);
+			pages[i] = mem_map_offset(page, pfn_offset);
 			get_page(pages[i]);
 		}
 
@@ -2262,7 +2311,7 @@ same_page:
 	*length = remainder;
 	*position = vaddr;
 
-	return i;
+	return i ? i : -EFAULT;
 }
 
 void hugetlb_change_protection(struct vm_area_struct *vma,