Diffstat (limited to 'mm/hugetlb.c'):

 mm/hugetlb.c | 124
 1 file changed, 99 insertions(+), 25 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 67a71191136e..6058b53dcb89 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <linux/seq_file.h>
 #include <linux/sysctl.h>
 #include <linux/highmem.h>
 #include <linux/mmu_notifier.h>
@@ -262,7 +263,7 @@ struct resv_map {
 	struct list_head regions;
 };
 
-struct resv_map *resv_map_alloc(void)
+static struct resv_map *resv_map_alloc(void)
 {
 	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
 	if (!resv_map)
@@ -274,7 +275,7 @@ struct resv_map *resv_map_alloc(void)
 	return resv_map;
 }
 
-void resv_map_release(struct kref *ref)
+static void resv_map_release(struct kref *ref)
 {
 	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
 
@@ -289,7 +290,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_SHARED))
 		return (struct resv_map *)(get_vma_private_data(vma) &
 					~HPAGE_RESV_MASK);
-	return 0;
+	return NULL;
 }
 
 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
@@ -353,11 +354,26 @@ static int vma_has_reserves(struct vm_area_struct *vma)
 	return 0;
 }
 
+static void clear_gigantic_page(struct page *page,
+			unsigned long addr, unsigned long sz)
+{
+	int i;
+	struct page *p = page;
+
+	might_sleep();
+	for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
+		cond_resched();
+		clear_user_highpage(p, addr + i * PAGE_SIZE);
+	}
+}
 static void clear_huge_page(struct page *page,
 			unsigned long addr, unsigned long sz)
 {
 	int i;
 
+	if (unlikely(sz > MAX_ORDER_NR_PAGES))
+		return clear_gigantic_page(page, addr, sz);
+
 	might_sleep();
 	for (i = 0; i < sz/PAGE_SIZE; i++) {
 		cond_resched();
@@ -365,12 +381,32 @@ static void clear_huge_page(struct page *page,
 	}
 }
 
+static void copy_gigantic_page(struct page *dst, struct page *src,
+			   unsigned long addr, struct vm_area_struct *vma)
+{
+	int i;
+	struct hstate *h = hstate_vma(vma);
+	struct page *dst_base = dst;
+	struct page *src_base = src;
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page(h); ) {
+		cond_resched();
+		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+
+		i++;
+		dst = mem_map_next(dst, dst_base, i);
+		src = mem_map_next(src, src_base, i);
+	}
+}
 static void copy_huge_page(struct page *dst, struct page *src,
 			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 	struct hstate *h = hstate_vma(vma);
 
+	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
+		return copy_gigantic_page(dst, src, addr, vma);
+
 	might_sleep();
 	for (i = 0; i < pages_per_huge_page(h); i++) {
 		cond_resched();
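A note on the two *_gigantic_page() walkers above: with SPARSEMEM but without VMEMMAP, struct pages are only guaranteed virtually contiguous within a MAX_ORDER-sized block, so a page spanning several such blocks cannot be walked with plain `page + i` pointer arithmetic. mem_map_next() steps by pointer within a block and recomputes through pfn_to_page() at each MAX_ORDER boundary. For reference, a paraphrased sketch of the helper as it appears in mm/internal.h around this kernel (check your tree for the authoritative version):

	/* Iterator helper, paraphrased from mm/internal.h: advance one
	 * struct page, falling back to a pfn lookup when the walk
	 * crosses a MAX_ORDER boundary where mem_map may not be
	 * virtually contiguous. */
	static inline struct page *mem_map_next(struct page *iter,
						struct page *base, int offset)
	{
		if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
			unsigned long pfn = page_to_pfn(base) + offset;

			if (!pfn_valid(pfn))
				return NULL;
			return pfn_to_page(pfn);
		}
		return iter + 1;	/* still inside the same block */
	}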
@@ -455,6 +491,8 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 {
 	int i;
 
+	VM_BUG_ON(h->order >= MAX_ORDER);
+
 	h->nr_huge_pages--;
 	h->nr_huge_pages_node[page_to_nid(page)]--;
 	for (i = 0; i < pages_per_huge_page(h); i++) {
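The new VM_BUG_ON() encodes the rule that gigantic pages never reach this function: an hstate with order >= MAX_ORDER is backed by bootmem, and the buddy allocator cannot take pages of that order back, so every allocate/free path is expected to bail out for such hstates before getting here. A minimal sketch of the predicate involved, using a hypothetical helper name (this tree open-codes the comparison at each call site):

	/* Hypothetical helper for illustration only; the tree at this
	 * point writes h->order >= MAX_ORDER directly. */
	static inline int hstate_is_gigantic(struct hstate *h)
	{
		return h->order >= MAX_ORDER;	/* too big for the buddy allocator */
	}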
@@ -969,6 +1007,14 @@ found:
 	return 1;
 }
 
+static void prep_compound_huge_page(struct page *page, int order)
+{
+	if (unlikely(order > (MAX_ORDER - 1)))
+		prep_compound_gigantic_page(page, order);
+	else
+		prep_compound_page(page, order);
+}
+
 /* Put bootmem huge pages into the standard lists after mem_map is up */
 static void __init gather_bootmem_prealloc(void)
 {
@@ -979,7 +1025,7 @@ static void __init gather_bootmem_prealloc(void)
 		struct hstate *h = m->hstate;
 		__ClearPageReserved(page);
 		WARN_ON(page_count(page) != 1);
-		prep_compound_page(page, h->order);
+		prep_compound_huge_page(page, h->order);
 		prep_new_huge_page(h, page, page_to_nid(page));
 	}
 }
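prep_compound_page() reaches every tail page with `page + i`, which again only holds inside one MAX_ORDER block; prep_compound_huge_page() therefore dispatches to a gigantic variant for bootmem-allocated pages. A paraphrased sketch of that variant as introduced in mm/page_alloc.c by the same series (check your tree):

	/* Same bookkeeping as prep_compound_page(), but tail pages are
	 * reached via mem_map_next() instead of pointer arithmetic. */
	static void prep_compound_gigantic_page(struct page *page,
						unsigned long order)
	{
		int i;
		int nr_pages = 1 << order;
		struct page *p = page + 1;

		set_compound_page_dtor(page, free_compound_page);
		set_compound_order(page, order);
		__SetPageHead(page);
		for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
			__SetPageTail(p);
			p->first_page = page;
		}
	}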
@@ -1455,15 +1501,15 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 
 #endif /* CONFIG_SYSCTL */
 
-int hugetlb_report_meminfo(char *buf)
+void hugetlb_report_meminfo(struct seq_file *m)
 {
 	struct hstate *h = &default_hstate;
-	return sprintf(buf,
+	seq_printf(m,
 			"HugePages_Total: %5lu\n"
 			"HugePages_Free:  %5lu\n"
 			"HugePages_Rsvd:  %5lu\n"
 			"HugePages_Surp:  %5lu\n"
-			"Hugepagesize:    %5lu kB\n",
+			"Hugepagesize:   %8lu kB\n",
 			h->nr_huge_pages,
 			h->free_huge_pages,
 			h->resv_huge_pages,
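The meminfo conversion drops the old fill-a-raw-buffer contract (and its unstated buffer-size assumption) in favour of seq_file, which grows its buffer as needed; the widened `%8lu` also leaves room for gigantic page sizes expressed in kB. Assumed caller side, paraphrased and heavily simplified from fs/proc/meminfo.c of this era (check your tree):

	/* /proc/meminfo is a single_open() seq_file, so the hugetlb block
	 * is appended to the caller's struct seq_file rather than being
	 * sprintf'd into a raw page. */
	static int meminfo_proc_show(struct seq_file *m, void *v)
	{
		struct sysinfo i;

		si_meminfo(&i);
		seq_printf(m, "MemTotal:       %8lu kB\n",
			   i.totalram << (PAGE_SHIFT - 10));
		hugetlb_report_meminfo(m);	/* was: len += hugetlb_report_meminfo(page + len); */
		return 0;
	}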
@@ -1747,11 +1793,10 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
  * from other VMAs and let the children be SIGKILLed if they are faulting the
  * same region.
  */
-int unmap_ref_private(struct mm_struct *mm,
-					struct vm_area_struct *vma,
-					struct page *page,
-					unsigned long address)
+static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
+				struct page *page, unsigned long address)
 {
+	struct hstate *h = hstate_vma(vma);
 	struct vm_area_struct *iter_vma;
 	struct address_space *mapping;
 	struct prio_tree_iter iter;
@@ -1761,7 +1806,7 @@ int unmap_ref_private(struct mm_struct *mm,
 	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
 	 * from page cache lookup which is in HPAGE_SIZE units.
 	 */
-	address = address & huge_page_mask(hstate_vma(vma));
+	address = address & huge_page_mask(h);
 	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
 		+ (vma->vm_pgoff >> PAGE_SHIFT);
 	mapping = (struct address_space *)page_private(page);
@@ -1780,7 +1825,7 @@
 		 */
 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
 			unmap_hugepage_range(iter_vma,
-				address, address + HPAGE_SIZE,
+				address, address + huge_page_size(h),
 				page);
 	}
 
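Replacing the compile-time HPAGE_SIZE with the per-hstate size makes this unmap-the-other-mappings path correct when several huge page sizes coexist: the range unmapped now matches the page size of the VMA that faulted. The accessors involved are trivial; paraphrased from include/linux/hugetlb.h in the multi-hstate code:

	/* Per-hstate replacements for the old compile-time constants. */
	static inline unsigned long huge_page_size(struct hstate *h)
	{
		return (unsigned long)PAGE_SIZE << h->order;
	}

	static inline unsigned long huge_page_mask(struct hstate *h)
	{
		return h->mask;
	}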
@@ -2008,7 +2053,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	entry = huge_ptep_get(ptep);
 	if (huge_pte_none(entry)) {
 		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
-		goto out_unlock;
+		goto out_mutex;
 	}
 
 	ret = 0;
@@ -2024,7 +2069,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (write_access && !pte_write(entry)) {
 		if (vma_needs_reservation(h, vma, address) < 0) {
 			ret = VM_FAULT_OOM;
-			goto out_unlock;
+			goto out_mutex;
 		}
 
 		if (!(vma->vm_flags & VM_SHARED))
| @@ -2034,10 +2079,23 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2034 | 2079 | ||
| 2035 | spin_lock(&mm->page_table_lock); | 2080 | spin_lock(&mm->page_table_lock); |
| 2036 | /* Check for a racing update before calling hugetlb_cow */ | 2081 | /* Check for a racing update before calling hugetlb_cow */ |
| 2037 | if (likely(pte_same(entry, huge_ptep_get(ptep)))) | 2082 | if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) |
| 2038 | if (write_access && !pte_write(entry)) | 2083 | goto out_page_table_lock; |
| 2084 | |||
| 2085 | |||
| 2086 | if (write_access) { | ||
| 2087 | if (!pte_write(entry)) { | ||
| 2039 | ret = hugetlb_cow(mm, vma, address, ptep, entry, | 2088 | ret = hugetlb_cow(mm, vma, address, ptep, entry, |
| 2040 | pagecache_page); | 2089 | pagecache_page); |
| 2090 | goto out_page_table_lock; | ||
| 2091 | } | ||
| 2092 | entry = pte_mkdirty(entry); | ||
| 2093 | } | ||
| 2094 | entry = pte_mkyoung(entry); | ||
| 2095 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access)) | ||
| 2096 | update_mmu_cache(vma, address, entry); | ||
| 2097 | |||
| 2098 | out_page_table_lock: | ||
| 2041 | spin_unlock(&mm->page_table_lock); | 2099 | spin_unlock(&mm->page_table_lock); |
| 2042 | 2100 | ||
| 2043 | if (pagecache_page) { | 2101 | if (pagecache_page) { |
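With this rewrite, hugetlb_fault() maintains the ACCESSED/DIRTY bits for huge PTEs the same way the normal fault path does for small pages, instead of only handling the COW case. For comparison, a paraphrased sketch of the equivalent logic at the tail of handle_pte_fault() in mm/memory.c around this kernel (check your tree; error/TLB-flush details omitted):

	/* Fragment, under mm->page_table_lock, with 'entry' already
	 * re-validated against *pte: */
	if (write_access) {
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address, pte, pmd,
					  ptl, entry);
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	if (ptep_set_access_flags(vma, address, pte, entry, write_access))
		update_mmu_cache(vma, address, entry);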
@@ -2045,7 +2103,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		put_page(pagecache_page);
 	}
 
-out_unlock:
+out_mutex:
 	mutex_unlock(&hugetlb_instantiation_mutex);
 
 	return ret;
@@ -2060,6 +2118,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
+static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
+{
+	if (!ptep || write || shared)
+		return 0;
+	else
+		return huge_pte_none(huge_ptep_get(ptep));
+}
+
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct page **pages, struct vm_area_struct **vmas,
 		unsigned long *position, int *length, int i,
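huge_zeropage_ok() says yes only for a read of a not-yet-instantiated PTE in a private mapping; in that case follow_hugetlb_page() (below) can hand back the shared zero page instead of faulting in a real huge page. The intended beneficiary is core dumping, where get_user_pages() reads of a sparse MAP_PRIVATE hugetlb area would otherwise allocate huge pages just to dump zeroes. A hypothetical illustration of such a consumer (dump_one_page() is invented for this note; get_user_pages() is shown with this era's signature, and the caller is assumed to hold mm->mmap_sem for read):

	static int dump_one_page(unsigned long addr)
	{
		struct page *page;
		int got;

		got = get_user_pages(current, current->mm, addr, 1,
				     0 /* write */, 0 /* force */, &page, NULL);
		if (got <= 0)
			return -EFAULT;
		/* For a never-faulted private hugetlb address, page may
		 * now be ZERO_PAGE(0): the dump emits zeroes without
		 * instantiating a real huge page. */
		put_page(page);
		return 0;
	}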
@@ -2069,6 +2135,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long vaddr = *position;
 	int remainder = *length;
 	struct hstate *h = hstate_vma(vma);
+	int zeropage_ok = 0;
+	int shared = vma->vm_flags & VM_SHARED;
 
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
@@ -2081,8 +2149,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * first, for the page indexing below to work.
 		 */
 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
+		if (huge_zeropage_ok(pte, write, shared))
+			zeropage_ok = 1;
 
-		if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
+		if (!pte ||
+		    (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
 		    (write && !pte_write(huge_ptep_get(pte)))) {
 			int ret;
 
@@ -2102,8 +2173,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page = pte_page(huge_ptep_get(pte));
 same_page:
 		if (pages) {
-			get_page(page);
-			pages[i] = page + pfn_offset;
+			if (zeropage_ok)
+				pages[i] = ZERO_PAGE(0);
+			else
+				pages[i] = mem_map_offset(page, pfn_offset);
+			get_page(pages[i]);
 		}
 
 		if (vmas)
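Two details in this last hunk: get_page() now runs on the page actually returned, so the zero page's refcount is taken rather than the huge page's; and `page + pfn_offset` becomes mem_map_offset(), the random-access counterpart of mem_map_next(), since pfn_offset into a gigantic page can cross a MAX_ORDER boundary. Paraphrased sketch from mm/internal.h of this series (check your tree):

	/* Return the struct page at 'offset' pages past 'base', going
	 * through pfn_to_page() when the target may lie outside base's
	 * MAX_ORDER block of contiguous struct pages. */
	static inline struct page *mem_map_offset(struct page *base, int offset)
	{
		if (unlikely(offset >= MAX_ORDER_NR_PAGES))
			return pfn_to_page(page_to_pfn(base) + offset);
		return base + offset;
	}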
