Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 195
1 file changed, 136 insertions, 59 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e198831276a3..bc727122dd44 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,17 +24,20 @@
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
-#include <linux/io.h>
+#include <asm/tlb.h>
 
+#include <linux/io.h>
 #include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
+#include <linux/hugetlb_cgroup.h>
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
 
-static int max_hstate;
+int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
 
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate;
 static unsigned long __initdata default_hstate_max_huge_pages;
 static unsigned long __initdata default_hstate_size;
 
-#define for_each_hstate(h) \
-	for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
-
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
  */
-static DEFINE_SPINLOCK(hugetlb_lock);
+DEFINE_SPINLOCK(hugetlb_lock);
 
 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 {
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src)
 static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
 	int nid = page_to_nid(page);
-	list_add(&page->lru, &h->hugepage_freelists[nid]);
+	list_move(&page->lru, &h->hugepage_freelists[nid]);
 	h->free_huge_pages++;
 	h->free_huge_pages_node[nid]++;
 }
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
 	if (list_empty(&h->hugepage_freelists[nid]))
 		return NULL;
 	page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
-	list_del(&page->lru);
+	list_move(&page->lru, &h->hugepage_activelist);
 	set_page_refcounted(page);
 	h->free_huge_pages--;
 	h->free_huge_pages_node[nid]--;
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 				1 << PG_active | 1 << PG_reserved |
 				1 << PG_private | 1 << PG_writeback);
 	}
+	VM_BUG_ON(hugetlb_cgroup_from_page(page));
 	set_compound_page_dtor(page, NULL);
 	set_page_refcounted(page);
 	arch_release_hugepage(page);
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page)
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
 	BUG_ON(page_mapcount(page));
-	INIT_LIST_HEAD(&page->lru);
 
 	spin_lock(&hugetlb_lock);
+	hugetlb_cgroup_uncharge_page(hstate_index(h),
+				     pages_per_huge_page(h), page);
 	if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+		/* remove the page from active list */
+		list_del(&page->lru);
 		update_and_free_page(h, page);
 		h->surplus_huge_pages--;
 		h->surplus_huge_pages_node[nid]--;
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page)
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
+	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, free_huge_page);
 	spin_lock(&hugetlb_lock);
+	set_hugetlb_cgroup(page, NULL);
 	h->nr_huge_pages++;
 	h->nr_huge_pages_node[nid]++;
 	spin_unlock(&hugetlb_lock);
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 
 	spin_lock(&hugetlb_lock);
 	if (page) {
+		INIT_LIST_HEAD(&page->lru);
 		r_nid = page_to_nid(page);
 		set_compound_page_dtor(page, free_huge_page);
+		set_hugetlb_cgroup(page, NULL);
 		/*
 		 * We incremented the global counters already
 		 */
@@ -993,7 +1001,6 @@ retry:
 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 		if ((--needed) < 0)
 			break;
-		list_del(&page->lru);
 		/*
 		 * This page is now managed by the hugetlb allocator and has
 		 * no users -- drop the buddy allocator's reference.
@@ -1008,7 +1015,6 @@ free:
 	/* Free unnecessary surplus pages to the buddy allocator */
 	if (!list_empty(&surplus_list)) {
 		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
-			list_del(&page->lru);
 			put_page(page);
 		}
 	}
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	struct hstate *h = hstate_vma(vma);
 	struct page *page;
 	long chg;
+	int ret, idx;
+	struct hugetlb_cgroup *h_cg;
 
+	idx = hstate_index(h);
 	/*
 	 * Processes that did not create the mapping will have no
 	 * reserves and will not have accounted against subpool
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 */
 	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
-		return ERR_PTR(-VM_FAULT_OOM);
+		return ERR_PTR(-ENOMEM);
 	if (chg)
 		if (hugepage_subpool_get_pages(spool, chg))
-			return ERR_PTR(-VM_FAULT_SIGBUS);
+			return ERR_PTR(-ENOSPC);
 
+	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
+	if (ret) {
+		hugepage_subpool_put_pages(spool, chg);
+		return ERR_PTR(-ENOSPC);
+	}
 	spin_lock(&hugetlb_lock);
 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
-	spin_unlock(&hugetlb_lock);
-
-	if (!page) {
+	if (page) {
+		/* update page cgroup details */
+		hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+					     h_cg, page);
+		spin_unlock(&hugetlb_lock);
+	} else {
+		spin_unlock(&hugetlb_lock);
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
+			hugetlb_cgroup_uncharge_cgroup(idx,
+						       pages_per_huge_page(h),
+						       h_cg);
 			hugepage_subpool_put_pages(spool, chg);
-			return ERR_PTR(-VM_FAULT_SIGBUS);
+			return ERR_PTR(-ENOSPC);
 		}
+		spin_lock(&hugetlb_lock);
+		hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+					     h_cg, page);
+		list_move(&page->lru, &h->hugepage_activelist);
+		spin_unlock(&hugetlb_lock);
 	}
 
 	set_page_private(page, (unsigned long)spool);
 
 	vma_commit_reservation(h, vma, addr);
-
 	return page;
 }
 
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
 				    struct attribute_group *hstate_attr_group)
 {
 	int retval;
-	int hi = h - hstates;
+	int hi = hstate_index(h);
 
 	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
 	if (!hstate_kobjs[hi])
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node)
 	if (!nhs->hugepages_kobj)
 		return;		/* no hstate attributes */
 
-	for_each_hstate(h)
-		if (nhs->hstate_kobjs[h - hstates]) {
-			kobject_put(nhs->hstate_kobjs[h - hstates]);
-			nhs->hstate_kobjs[h - hstates] = NULL;
+	for_each_hstate(h) {
+		int idx = hstate_index(h);
+		if (nhs->hstate_kobjs[idx]) {
+			kobject_put(nhs->hstate_kobjs[idx]);
+			nhs->hstate_kobjs[idx] = NULL;
 		}
+	}
 
 	kobject_put(nhs->hugepages_kobj);
 	nhs->hugepages_kobj = NULL;
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void)
 	hugetlb_unregister_all_nodes();
 
 	for_each_hstate(h) {
-		kobject_put(hstate_kobjs[h - hstates]);
+		kobject_put(hstate_kobjs[hstate_index(h)]);
 	}
 
 	kobject_put(hugepages_kobj);
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void)
 		if (!size_to_hstate(default_hstate_size))
 			hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
 	}
-	default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
+	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
 	if (default_hstate_max_huge_pages)
 		default_hstate.max_huge_pages = default_hstate_max_huge_pages;
 
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order)
 		printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
 		return;
 	}
-	BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
 	BUG_ON(order == 0);
-	h = &hstates[max_hstate++];
+	h = &hstates[hugetlb_max_hstate++];
 	h->order = order;
 	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
 	h->nr_huge_pages = 0;
 	h->free_huge_pages = 0;
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+	INIT_LIST_HEAD(&h->hugepage_activelist);
 	h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
 	h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
+	/*
+	 * Add cgroup control files only if the huge page consists
+	 * of more than two normal pages. This is because we use
+	 * page[2].lru.next for storing cgoup details.
+	 */
+	if (order >= HUGETLB_CGROUP_MIN_ORDER)
+		hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
 
 	parsed_hstate = h;
 }
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s)
 	static unsigned long *last_mhp;
 
 	/*
-	 * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
+	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
 	 * so this hugepages= parameter goes to the "default hstate".
 	 */
-	if (!max_hstate)
+	if (!hugetlb_max_hstate)
 		mhp = &default_hstate_max_huge_pages;
 	else
 		mhp = &parsed_hstate->max_huge_pages;
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s)
 	 * But we need to allocate >= MAX_ORDER hstates here early to still
 	 * use the bootmem allocator.
 	 */
-	if (max_hstate && parsed_hstate->order >= MAX_ORDER)
+	if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
 		hugetlb_hstate_alloc_pages(parsed_hstate);
 
 	last_mhp = mhp;
@@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 	return 0;
 }
 
-void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-			    unsigned long end, struct page *ref_page)
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+			    unsigned long start, unsigned long end,
+			    struct page *ref_page)
 {
+	int force_flush = 0;
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
 	pte_t *ptep;
 	pte_t pte;
 	struct page *page;
-	struct page *tmp;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
 
-	/*
-	 * A page gathering list, protected by per file i_mmap_mutex. The
-	 * lock is used to avoid list corruption from multiple unmapping
-	 * of the same page since we are using page->lru.
-	 */
-	LIST_HEAD(page_list);
-
 	WARN_ON(!is_vm_hugetlb_page(vma));
 	BUG_ON(start & ~huge_page_mask(h));
 	BUG_ON(end & ~huge_page_mask(h));
 
+	tlb_start_vma(tlb, vma);
 	mmu_notifier_invalidate_range_start(mm, start, end);
+again:
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address);
@@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 		}
 
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
+		tlb_remove_tlb_entry(tlb, ptep, address);
 		if (pte_dirty(pte))
 			set_page_dirty(page);
-		list_add(&page->lru, &page_list);
 
+		page_remove_rmap(page);
+		force_flush = !__tlb_remove_page(tlb, page);
+		if (force_flush)
+			break;
 		/* Bail out after unmapping reference page if supplied */
 		if (ref_page)
 			break;
 	}
-	flush_tlb_range(vma, start, end);
 	spin_unlock(&mm->page_table_lock);
-	mmu_notifier_invalidate_range_end(mm, start, end);
-	list_for_each_entry_safe(page, tmp, &page_list, lru) {
-		page_remove_rmap(page);
-		list_del(&page->lru);
-		put_page(page);
+	/*
+	 * mmu_gather ran out of room to batch pages, we break out of
+	 * the PTE lock to avoid doing the potential expensive TLB invalidate
+	 * and page-free while holding it.
+	 */
+	if (force_flush) {
+		force_flush = 0;
+		tlb_flush_mmu(tlb);
+		if (address < end && !ref_page)
+			goto again;
 	}
+	mmu_notifier_invalidate_range_end(mm, start, end);
+	tlb_end_vma(tlb, vma);
+}
+
+void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+			  struct vm_area_struct *vma, unsigned long start,
+			  unsigned long end, struct page *ref_page)
+{
+	__unmap_hugepage_range(tlb, vma, start, end, ref_page);
+
+	/*
+	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
+	 * test will fail on a vma being torn down, and not grab a page table
+	 * on its way out. We're lucky that the flag has such an appropriate
+	 * name, and can in fact be safely cleared here. We could clear it
+	 * before the __unmap_hugepage_range above, but all that's necessary
+	 * is to clear it before releasing the i_mmap_mutex. This works
+	 * because in the context this is called, the VMA is about to be
+	 * destroyed and the i_mmap_mutex is held.
+	 */
+	vma->vm_flags &= ~VM_MAYSHARE;
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			  unsigned long end, struct page *ref_page)
 {
-	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
-	__unmap_hugepage_range(vma, start, end, ref_page);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+	struct mm_struct *mm;
+	struct mmu_gather tlb;
+
+	mm = vma->vm_mm;
+
+	tlb_gather_mmu(&tlb, mm, 0);
+	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+	tlb_finish_mmu(&tlb, start, end);
 }
 
 /*
@@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * from the time of fork. This would look like data corruption
 		 */
 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
-			__unmap_hugepage_range(iter_vma,
-				address, address + huge_page_size(h),
-				page);
+			unmap_hugepage_range(iter_vma, address,
+					     address + huge_page_size(h), page);
 	}
 	mutex_unlock(&mapping->i_mmap_mutex);
 
@@ -2496,6 +2560,7 @@ retry_avoidcopy:
 	new_page = alloc_huge_page(vma, address, outside_reserve);
 
 	if (IS_ERR(new_page)) {
+		long err = PTR_ERR(new_page);
 		page_cache_release(old_page);
 
 		/*
@@ -2524,7 +2589,10 @@ retry_avoidcopy:
 
 		/* Caller expects lock to be held */
 		spin_lock(&mm->page_table_lock);
-		return -PTR_ERR(new_page);
+		if (err == -ENOMEM)
+			return VM_FAULT_OOM;
+		else
+			return VM_FAULT_SIGBUS;
 	}
 
 	/*
@@ -2642,7 +2710,11 @@ retry:
 			goto out;
 		page = alloc_huge_page(vma, address, 0);
 		if (IS_ERR(page)) {
-			ret = -PTR_ERR(page);
+			ret = PTR_ERR(page);
+			if (ret == -ENOMEM)
+				ret = VM_FAULT_OOM;
+			else
+				ret = VM_FAULT_SIGBUS;
 			goto out;
 		}
 		clear_huge_page(page, address, pages_per_huge_page(h));
@@ -2679,7 +2751,7 @@ retry:
 		 */
 		if (unlikely(PageHWPoison(page))) {
 			ret = VM_FAULT_HWPOISON |
-				VM_FAULT_SET_HINDEX(h - hstates);
+				VM_FAULT_SET_HINDEX(hstate_index(h));
 			goto backout_unlocked;
 		}
 	}
@@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			return 0;
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
-				VM_FAULT_SET_HINDEX(h - hstates);
+				VM_FAULT_SET_HINDEX(hstate_index(h));
 	}
 
 	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+	/*
+	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+	 * may have cleared our pud entry and done put_page on the page table:
+	 * once we release i_mmap_mutex, another task can do the final put_page
+	 * and that page table be reused and filled with junk.
+	 */
 	flush_tlb_range(vma, start, end);
+	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
 int hugetlb_reserve_pages(struct inode *inode,