Diffstat (limited to 'mm/hugetlb.c')
 mm/hugetlb.c | 195 ++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 136 insertions(+), 59 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e198831276a3..bc727122dd44 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,17 +24,20 @@
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
-#include <linux/io.h>
+#include <asm/tlb.h>
 
+#include <linux/io.h>
 #include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
+#include <linux/hugetlb_cgroup.h>
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
 
-static int max_hstate;
+int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
 
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate;
 static unsigned long __initdata default_hstate_max_huge_pages;
 static unsigned long __initdata default_hstate_size;
 
-#define for_each_hstate(h) \
-	for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
-
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
  */
-static DEFINE_SPINLOCK(hugetlb_lock);
+DEFINE_SPINLOCK(hugetlb_lock);
 
 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 {
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src)
 static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
 	int nid = page_to_nid(page);
-	list_add(&page->lru, &h->hugepage_freelists[nid]);
+	list_move(&page->lru, &h->hugepage_freelists[nid]);
 	h->free_huge_pages++;
 	h->free_huge_pages_node[nid]++;
 }
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
 	if (list_empty(&h->hugepage_freelists[nid]))
 		return NULL;
 	page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
-	list_del(&page->lru);
+	list_move(&page->lru, &h->hugepage_activelist);
 	set_page_refcounted(page);
 	h->free_huge_pages--;
 	h->free_huge_pages_node[nid]--;
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 				1 << PG_active | 1 << PG_reserved |
 				1 << PG_private | 1 << PG_writeback);
 	}
+	VM_BUG_ON(hugetlb_cgroup_from_page(page));
 	set_compound_page_dtor(page, NULL);
 	set_page_refcounted(page);
 	arch_release_hugepage(page);
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page)
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
 	BUG_ON(page_mapcount(page));
-	INIT_LIST_HEAD(&page->lru);
 
 	spin_lock(&hugetlb_lock);
+	hugetlb_cgroup_uncharge_page(hstate_index(h),
+				     pages_per_huge_page(h), page);
 	if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+		/* remove the page from active list */
+		list_del(&page->lru);
 		update_and_free_page(h, page);
 		h->surplus_huge_pages--;
 		h->surplus_huge_pages_node[nid]--;
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page)
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
+	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, free_huge_page);
 	spin_lock(&hugetlb_lock);
+	set_hugetlb_cgroup(page, NULL);
 	h->nr_huge_pages++;
 	h->nr_huge_pages_node[nid]++;
 	spin_unlock(&hugetlb_lock);
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 
 	spin_lock(&hugetlb_lock);
 	if (page) {
+		INIT_LIST_HEAD(&page->lru);
 		r_nid = page_to_nid(page);
 		set_compound_page_dtor(page, free_huge_page);
+		set_hugetlb_cgroup(page, NULL);
 		/*
 		 * We incremented the global counters already
 		 */
@@ -993,7 +1001,6 @@ retry:
 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 		if ((--needed) < 0)
 			break;
-		list_del(&page->lru);
 		/*
 		 * This page is now managed by the hugetlb allocator and has
 		 * no users -- drop the buddy allocator's reference.
@@ -1008,7 +1015,6 @@ free:
 	/* Free unnecessary surplus pages to the buddy allocator */
 	if (!list_empty(&surplus_list)) {
 		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
-			list_del(&page->lru);
 			put_page(page);
 		}
 	}
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	struct hstate *h = hstate_vma(vma);
 	struct page *page;
 	long chg;
+	int ret, idx;
+	struct hugetlb_cgroup *h_cg;
 
+	idx = hstate_index(h);
 	/*
 	 * Processes that did not create the mapping will have no
 	 * reserves and will not have accounted against subpool
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 */
 	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
-		return ERR_PTR(-VM_FAULT_OOM);
+		return ERR_PTR(-ENOMEM);
 	if (chg)
 		if (hugepage_subpool_get_pages(spool, chg))
-			return ERR_PTR(-VM_FAULT_SIGBUS);
+			return ERR_PTR(-ENOSPC);
 
+	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
+	if (ret) {
+		hugepage_subpool_put_pages(spool, chg);
+		return ERR_PTR(-ENOSPC);
+	}
 	spin_lock(&hugetlb_lock);
 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
-	spin_unlock(&hugetlb_lock);
-
-	if (!page) {
+	if (page) {
+		/* update page cgroup details */
+		hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+					     h_cg, page);
+		spin_unlock(&hugetlb_lock);
+	} else {
+		spin_unlock(&hugetlb_lock);
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
+			hugetlb_cgroup_uncharge_cgroup(idx,
+						       pages_per_huge_page(h),
+						       h_cg);
 			hugepage_subpool_put_pages(spool, chg);
-			return ERR_PTR(-VM_FAULT_SIGBUS);
+			return ERR_PTR(-ENOSPC);
 		}
+		spin_lock(&hugetlb_lock);
+		hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+					     h_cg, page);
+		list_move(&page->lru, &h->hugepage_activelist);
+		spin_unlock(&hugetlb_lock);
 	}
 
 	set_page_private(page, (unsigned long)spool);
 
 	vma_commit_reservation(h, vma, addr);
-
 	return page;
 }
 
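With this hunk, alloc_huge_page() reports failure through ERR_PTR(-ENOMEM) and ERR_PTR(-ENOSPC) rather than negated VM_FAULT codes, so callers must translate the errno into a fault code themselves; the hugetlb_cow() and hugetlb_no_page() hunks later in this diff do exactly that. A condensed caller-side sketch of that translation (a fragment, not a complete function, using only names that appear in this patch):

	page = alloc_huge_page(vma, address, 0);
	if (IS_ERR(page)) {
		/* -ENOMEM: charge or allocation failure; anything else: quota/reservation */
		ret = (PTR_ERR(page) == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
		goto out;
	}
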
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
 				    struct attribute_group *hstate_attr_group)
 {
 	int retval;
-	int hi = h - hstates;
+	int hi = hstate_index(h);
 
 	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
 	if (!hstate_kobjs[hi])
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node)
 	if (!nhs->hugepages_kobj)
 		return;		/* no hstate attributes */
 
-	for_each_hstate(h)
-		if (nhs->hstate_kobjs[h - hstates]) {
-			kobject_put(nhs->hstate_kobjs[h - hstates]);
-			nhs->hstate_kobjs[h - hstates] = NULL;
+	for_each_hstate(h) {
+		int idx = hstate_index(h);
+		if (nhs->hstate_kobjs[idx]) {
+			kobject_put(nhs->hstate_kobjs[idx]);
+			nhs->hstate_kobjs[idx] = NULL;
 		}
+	}
 
 	kobject_put(nhs->hugepages_kobj);
 	nhs->hugepages_kobj = NULL;
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void)
 	hugetlb_unregister_all_nodes();
 
 	for_each_hstate(h) {
-		kobject_put(hstate_kobjs[h - hstates]);
+		kobject_put(hstate_kobjs[hstate_index(h)]);
 	}
 
 	kobject_put(hugepages_kobj);
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void)
 		if (!size_to_hstate(default_hstate_size))
 			hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
 	}
-	default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
+	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
 	if (default_hstate_max_huge_pages)
 		default_hstate.max_huge_pages = default_hstate_max_huge_pages;
 
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order)
 		printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
 		return;
 	}
-	BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
 	BUG_ON(order == 0);
-	h = &hstates[max_hstate++];
+	h = &hstates[hugetlb_max_hstate++];
 	h->order = order;
 	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
 	h->nr_huge_pages = 0;
 	h->free_huge_pages = 0;
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+	INIT_LIST_HEAD(&h->hugepage_activelist);
 	h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
 	h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
+	/*
+	 * Add cgroup control files only if the huge page consists
+	 * of more than two normal pages. This is because we use
+	 * page[2].lru.next for storing cgroup details.
+	 */
+	if (order >= HUGETLB_CGROUP_MIN_ORDER)
+		hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
 
 	parsed_hstate = h;
 }
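The new comment above explains where the cgroup pointer lives: hugetlb_cgroup_from_page()/set_hugetlb_cgroup() stash it in page[2].lru.next of the compound page, which is why hstates below HUGETLB_CGROUP_MIN_ORDER get no control files. A rough sketch of what such an accessor could look like (an assumption based on that comment, not the verbatim <linux/hugetlb_cgroup.h> implementation):

	static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
	{
		/* assumption: the pointer is parked in a tail page's unused lru field */
		if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
			return NULL;
		return (struct hugetlb_cgroup *)page[2].lru.next;
	}
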
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s)
 	static unsigned long *last_mhp;
 
 	/*
-	 * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
+	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
 	 * so this hugepages= parameter goes to the "default hstate".
 	 */
-	if (!max_hstate)
+	if (!hugetlb_max_hstate)
 		mhp = &default_hstate_max_huge_pages;
 	else
 		mhp = &parsed_hstate->max_huge_pages;
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s)
 	 * But we need to allocate >= MAX_ORDER hstates here early to still
 	 * use the bootmem allocator.
 	 */
-	if (max_hstate && parsed_hstate->order >= MAX_ORDER)
+	if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
 		hugetlb_hstate_alloc_pages(parsed_hstate);
 
 	last_mhp = mhp;
@@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 	return 0;
 }
 
-void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-			    unsigned long end, struct page *ref_page)
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+			    unsigned long start, unsigned long end,
+			    struct page *ref_page)
 {
+	int force_flush = 0;
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
 	pte_t *ptep;
 	pte_t pte;
 	struct page *page;
-	struct page *tmp;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
 
-	/*
-	 * A page gathering list, protected by per file i_mmap_mutex. The
-	 * lock is used to avoid list corruption from multiple unmapping
-	 * of the same page since we are using page->lru.
-	 */
-	LIST_HEAD(page_list);
-
 	WARN_ON(!is_vm_hugetlb_page(vma));
 	BUG_ON(start & ~huge_page_mask(h));
 	BUG_ON(end & ~huge_page_mask(h));
 
+	tlb_start_vma(tlb, vma);
 	mmu_notifier_invalidate_range_start(mm, start, end);
+again:
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address);
@@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 		}
 
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
+		tlb_remove_tlb_entry(tlb, ptep, address);
 		if (pte_dirty(pte))
 			set_page_dirty(page);
-		list_add(&page->lru, &page_list);
 
+		page_remove_rmap(page);
+		force_flush = !__tlb_remove_page(tlb, page);
+		if (force_flush)
+			break;
 		/* Bail out after unmapping reference page if supplied */
 		if (ref_page)
 			break;
 	}
-	flush_tlb_range(vma, start, end);
 	spin_unlock(&mm->page_table_lock);
-	mmu_notifier_invalidate_range_end(mm, start, end);
-	list_for_each_entry_safe(page, tmp, &page_list, lru) {
-		page_remove_rmap(page);
-		list_del(&page->lru);
-		put_page(page);
+	/*
+	 * mmu_gather ran out of room to batch pages, we break out of
+	 * the PTE lock to avoid doing the potential expensive TLB invalidate
+	 * and page-free while holding it.
+	 */
+	if (force_flush) {
+		force_flush = 0;
+		tlb_flush_mmu(tlb);
+		if (address < end && !ref_page)
+			goto again;
 	}
+	mmu_notifier_invalidate_range_end(mm, start, end);
+	tlb_end_vma(tlb, vma);
+}
+
+void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+			  struct vm_area_struct *vma, unsigned long start,
+			  unsigned long end, struct page *ref_page)
+{
+	__unmap_hugepage_range(tlb, vma, start, end, ref_page);
+
+	/*
+	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
+	 * test will fail on a vma being torn down, and not grab a page table
+	 * on its way out. We're lucky that the flag has such an appropriate
+	 * name, and can in fact be safely cleared here. We could clear it
+	 * before the __unmap_hugepage_range above, but all that's necessary
+	 * is to clear it before releasing the i_mmap_mutex. This works
+	 * because in the context this is called, the VMA is about to be
+	 * destroyed and the i_mmap_mutex is held.
+	 */
+	vma->vm_flags &= ~VM_MAYSHARE;
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			  unsigned long end, struct page *ref_page)
 {
-	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
-	__unmap_hugepage_range(vma, start, end, ref_page);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+	struct mm_struct *mm;
+	struct mmu_gather tlb;
+
+	mm = vma->vm_mm;
+
+	tlb_gather_mmu(&tlb, mm, 0);
+	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+	tlb_finish_mmu(&tlb, start, end);
 }
 
 /*
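After this conversion, callers are expected to drive hugetlb unmapping through an mmu_gather rather than a private page list, exactly as the reworked unmap_hugepage_range() above does. A condensed sketch of that calling sequence (a fragment; vma, start, end and ref_page as in the functions above, reusing the 3.x tlb_gather_mmu(tlb, mm, fullmm) signature shown in this hunk):

	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, vma->vm_mm, 0);	/* 0: tearing down a range, not the whole mm */
	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
	tlb_finish_mmu(&tlb, start, end);	/* flush the TLB and free the batched pages */
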
@@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * from the time of fork. This would look like data corruption
 		 */
 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
-			__unmap_hugepage_range(iter_vma,
-				address, address + huge_page_size(h),
-				page);
+			unmap_hugepage_range(iter_vma, address,
+					     address + huge_page_size(h), page);
 	}
 	mutex_unlock(&mapping->i_mmap_mutex);
 
@@ -2496,6 +2560,7 @@ retry_avoidcopy:
 	new_page = alloc_huge_page(vma, address, outside_reserve);
 
 	if (IS_ERR(new_page)) {
+		long err = PTR_ERR(new_page);
 		page_cache_release(old_page);
 
 		/*
@@ -2524,7 +2589,10 @@ retry_avoidcopy:
 
 		/* Caller expects lock to be held */
 		spin_lock(&mm->page_table_lock);
-		return -PTR_ERR(new_page);
+		if (err == -ENOMEM)
+			return VM_FAULT_OOM;
+		else
+			return VM_FAULT_SIGBUS;
 	}
 
 	/*
@@ -2642,7 +2710,11 @@ retry:
 			goto out;
 		page = alloc_huge_page(vma, address, 0);
 		if (IS_ERR(page)) {
-			ret = -PTR_ERR(page);
+			ret = PTR_ERR(page);
+			if (ret == -ENOMEM)
+				ret = VM_FAULT_OOM;
+			else
+				ret = VM_FAULT_SIGBUS;
 			goto out;
 		}
 		clear_huge_page(page, address, pages_per_huge_page(h));
@@ -2679,7 +2751,7 @@ retry:
 			 */
 			if (unlikely(PageHWPoison(page))) {
 				ret = VM_FAULT_HWPOISON |
-					VM_FAULT_SET_HINDEX(h - hstates);
+					VM_FAULT_SET_HINDEX(hstate_index(h));
 				goto backout_unlocked;
 			}
 		}
@@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			return 0;
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
-				VM_FAULT_SET_HINDEX(h - hstates);
+				VM_FAULT_SET_HINDEX(hstate_index(h));
 	}
 
 	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+	/*
+	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+	 * may have cleared our pud entry and done put_page on the page table:
+	 * once we release i_mmap_mutex, another task can do the final put_page
+	 * and that page table be reused and filled with junk.
+	 */
 	flush_tlb_range(vma, start, end);
+	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
 int hugetlb_reserve_pages(struct inode *inode,