Diffstat (limited to 'mm/memory.c')

-rw-r--r--  mm/memory.c | 341
1 files changed, 194 insertions, 147 deletions

diff --git a/mm/memory.c b/mm/memory.c
index f64cbf9baa36..ca8cac11bd2c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -78,11 +78,9 @@ unsigned long num_physpages;
  * and ZONE_HIGHMEM.
  */
 void * high_memory;
-unsigned long vmalloc_earlyreserve;
 
 EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
-EXPORT_SYMBOL(vmalloc_earlyreserve);
 
 int randomize_va_space __read_mostly = 1;
 
@@ -1049,43 +1047,51 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		if (pages)
 			foll_flags |= FOLL_GET;
 		if (!write && !(vma->vm_flags & VM_LOCKED) &&
-				(!vma->vm_ops || !vma->vm_ops->nopage))
+				(!vma->vm_ops || (!vma->vm_ops->nopage &&
+					!vma->vm_ops->fault)))
 			foll_flags |= FOLL_ANON;
 
 		do {
 			struct page *page;
 
+			/*
+			 * If tsk is ooming, cut off its access to large memory
+			 * allocations. It has a pending SIGKILL, but it can't
+			 * be processed until returning to user space.
+			 */
+			if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
+				return -ENOMEM;
+
 			if (write)
 				foll_flags |= FOLL_WRITE;
 
 			cond_resched();
 			while (!(page = follow_page(vma, start, foll_flags))) {
 				int ret;
-				ret = __handle_mm_fault(mm, vma, start,
+				ret = handle_mm_fault(mm, vma, start,
 						foll_flags & FOLL_WRITE);
+				if (ret & VM_FAULT_ERROR) {
+					if (ret & VM_FAULT_OOM)
+						return i ? i : -ENOMEM;
+					else if (ret & VM_FAULT_SIGBUS)
+						return i ? i : -EFAULT;
+					BUG();
+				}
+				if (ret & VM_FAULT_MAJOR)
+					tsk->maj_flt++;
+				else
+					tsk->min_flt++;
+
 				/*
-				 * The VM_FAULT_WRITE bit tells us that do_wp_page has
-				 * broken COW when necessary, even if maybe_mkwrite
-				 * decided not to set pte_write. We can thus safely do
-				 * subsequent page lookups as if they were reads.
+				 * The VM_FAULT_WRITE bit tells us that
+				 * do_wp_page has broken COW when necessary,
+				 * even if maybe_mkwrite decided not to set
+				 * pte_write. We can thus safely do subsequent
+				 * page lookups as if they were reads.
 				 */
 				if (ret & VM_FAULT_WRITE)
 					foll_flags &= ~FOLL_WRITE;
 
-				switch (ret & ~VM_FAULT_WRITE) {
-				case VM_FAULT_MINOR:
-					tsk->min_flt++;
-					break;
-				case VM_FAULT_MAJOR:
-					tsk->maj_flt++;
-					break;
-				case VM_FAULT_SIGBUS:
-					return i ? i : -EFAULT;
-				case VM_FAULT_OOM:
-					return i ? i : -ENOMEM;
-				default:
-					BUG();
-				}
 				cond_resched();
 			}
 			if (pages) {
@@ -1632,7 +1638,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
-	int reuse = 0, ret = VM_FAULT_MINOR;
+	int reuse = 0, ret = 0;
 	struct page *dirty_page = NULL;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
@@ -1709,11 +1715,11 @@ gotten:
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
 	if (old_page == ZERO_PAGE(address)) {
-		new_page = alloc_zeroed_user_highpage(vma, address);
+		new_page = alloc_zeroed_user_highpage_movable(vma, address);
 		if (!new_page)
 			goto oom;
 	} else {
-		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 		if (!new_page)
 			goto oom;
 		cow_user_page(new_page, old_page, address, vma);
@@ -1759,6 +1765,15 @@ gotten:
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 	if (dirty_page) {
+		/*
+		 * Yes, Virginia, this is actually required to prevent a race
+		 * with clear_page_dirty_for_io() from clearing the page dirty
+		 * bit after it clear all dirty ptes, but before a racing
+		 * do_wp_page installs a dirty pte.
+		 *
+		 * do_no_page is protected similarly.
+		 */
+		wait_on_page_locked(dirty_page);
 		set_page_dirty_balance(dirty_page);
 		put_page(dirty_page);
 	}
@@ -1825,6 +1840,13 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma,
 	unsigned long restart_addr;
 	int need_break;
 
+	/*
+	 * files that support invalidating or truncating portions of the
+	 * file from under mmaped areas must have their ->fault function
+	 * return a locked page (and set VM_FAULT_LOCKED in the return).
+	 * This provides synchronisation against concurrent unmapping here.
+	 */
+
 again:
 	restart_addr = vma->vm_truncate_count;
 	if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
@@ -1953,17 +1975,8 @@ void unmap_mapping_range(struct address_space *mapping,
 
 	spin_lock(&mapping->i_mmap_lock);
 
-	/* serialize i_size write against truncate_count write */
-	smp_wmb();
-	/* Protect against page faults, and endless unmapping loops */
+	/* Protect against endless unmapping loops */
 	mapping->truncate_count++;
-	/*
-	 * For archs where spin_lock has inclusive semantics like ia64
-	 * this smp_mb() will prevent to read pagetable contents
-	 * before the truncate_count increment is visible to
-	 * other cpus.
-	 */
-	smp_mb();
 	if (unlikely(is_restart_addr(mapping->truncate_count))) {
 		if (mapping->truncate_count == 0)
 			reset_vma_truncate_counts(mapping);
@@ -2002,8 +2015,18 @@ int vmtruncate(struct inode * inode, loff_t offset)
 	if (IS_SWAPFILE(inode))
 		goto out_busy;
 	i_size_write(inode, offset);
+
+	/*
+	 * unmap_mapping_range is called twice, first simply for efficiency
+	 * so that truncate_inode_pages does fewer single-page unmaps. However
+	 * after this first call, and before truncate_inode_pages finishes,
+	 * it is possible for private pages to be COWed, which remain after
+	 * truncate_inode_pages finishes, hence the second unmap_mapping_range
+	 * call must be made for correctness.
+	 */
 	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
 	truncate_inode_pages(mapping, offset);
+	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
 	goto out_truncate;
 
 do_expand:
@@ -2043,6 +2066,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 	down_write(&inode->i_alloc_sem);
 	unmap_mapping_range(mapping, offset, (end - offset), 1);
 	truncate_inode_pages_range(mapping, offset, end);
+	unmap_mapping_range(mapping, offset, (end - offset), 1);
 	inode->i_op->truncate_range(inode, offset, end);
 	up_write(&inode->i_alloc_sem);
 	mutex_unlock(&inode->i_mutex);
@@ -2124,7 +2148,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *page;
 	swp_entry_t entry;
 	pte_t pte;
-	int ret = VM_FAULT_MINOR;
+	int ret = 0;
 
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		goto out;
@@ -2192,15 +2216,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unlock_page(page);
 
 	if (write_access) {
+		/* XXX: We could OR the do_wp_page code with this one? */
 		if (do_wp_page(mm, vma, address,
-				page_table, pmd, ptl, pte) == VM_FAULT_OOM)
+				page_table, pmd, ptl, pte) & VM_FAULT_OOM)
 			ret = VM_FAULT_OOM;
 		goto out;
 	}
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, address, pte);
-	lazy_mmu_prot_update(pte);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 out:
@@ -2231,7 +2255,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 		if (unlikely(anon_vma_prepare(vma)))
 			goto oom;
-		page = alloc_zeroed_user_highpage(vma, address);
+		page = alloc_zeroed_user_highpage_movable(vma, address);
 		if (!page)
 			goto oom;
 
@@ -2265,7 +2289,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	lazy_mmu_prot_update(entry);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
-	return VM_FAULT_MINOR;
+	return 0;
 release:
 	page_cache_release(page);
 	goto unlock;
@@ -2274,10 +2298,10 @@ oom:
 }
 
 /*
- * do_no_page() tries to create a new page mapping. It aggressively
+ * __do_fault() tries to create a new page mapping. It aggressively
  * tries to share with existing pages, but makes a separate copy if
- * the "write_access" parameter is true in order to avoid the next
- * page fault.
+ * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
+ * the next page fault.
  *
  * As this is called only for pages that do not currently exist, we
  * do not need to flush old virtual caches or the TLB.
@@ -2286,89 +2310,100 @@ oom:
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		int write_access)
+		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
 	spinlock_t *ptl;
-	struct page *new_page;
-	struct address_space *mapping = NULL;
+	struct page *page;
 	pte_t entry;
-	unsigned int sequence = 0;
-	int ret = VM_FAULT_MINOR;
 	int anon = 0;
 	struct page *dirty_page = NULL;
+	struct vm_fault vmf;
+	int ret;
+
+	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+	vmf.pgoff = pgoff;
+	vmf.flags = flags;
+	vmf.page = NULL;
 
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
 
-	if (vma->vm_file) {
-		mapping = vma->vm_file->f_mapping;
-		sequence = mapping->truncate_count;
-		smp_rmb(); /* serializes i_size against truncate_count */
+	if (likely(vma->vm_ops->fault)) {
+		ret = vma->vm_ops->fault(vma, &vmf);
+		if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+			return ret;
+	} else {
+		/* Legacy ->nopage path */
+		ret = 0;
+		vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
+		/* no page was available -- either SIGBUS or OOM */
+		if (unlikely(vmf.page == NOPAGE_SIGBUS))
+			return VM_FAULT_SIGBUS;
+		else if (unlikely(vmf.page == NOPAGE_OOM))
+			return VM_FAULT_OOM;
 	}
-retry:
-	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
+
 	/*
-	 * No smp_rmb is needed here as long as there's a full
-	 * spin_lock/unlock sequence inside the ->nopage callback
-	 * (for the pagecache lookup) that acts as an implicit
-	 * smp_mb() and prevents the i_size read to happen
-	 * after the next truncate_count read.
+	 * For consistency in subsequent calls, make the faulted page always
+	 * locked.
 	 */
-
-	/* no page was available -- either SIGBUS, OOM or REFAULT */
-	if (unlikely(new_page == NOPAGE_SIGBUS))
-		return VM_FAULT_SIGBUS;
-	else if (unlikely(new_page == NOPAGE_OOM))
-		return VM_FAULT_OOM;
-	else if (unlikely(new_page == NOPAGE_REFAULT))
-		return VM_FAULT_MINOR;
+	if (unlikely(!(ret & VM_FAULT_LOCKED)))
+		lock_page(vmf.page);
+	else
+		VM_BUG_ON(!PageLocked(vmf.page));
 
 	/*
 	 * Should we do an early C-O-W break?
 	 */
-	if (write_access) {
+	page = vmf.page;
+	if (flags & FAULT_FLAG_WRITE) {
 		if (!(vma->vm_flags & VM_SHARED)) {
-			struct page *page;
-
-			if (unlikely(anon_vma_prepare(vma)))
-				goto oom;
-			page = alloc_page_vma(GFP_HIGHUSER, vma, address);
-			if (!page)
-				goto oom;
-			copy_user_highpage(page, new_page, address, vma);
-			page_cache_release(new_page);
-			new_page = page;
 			anon = 1;
-
+			if (unlikely(anon_vma_prepare(vma))) {
+				ret = VM_FAULT_OOM;
+				goto out;
+			}
+			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
+						vma, address);
+			if (!page) {
+				ret = VM_FAULT_OOM;
+				goto out;
+			}
+			copy_user_highpage(page, vmf.page, address, vma);
 		} else {
-			/* if the page will be shareable, see if the backing
+			/*
+			 * If the page will be shareable, see if the backing
 			 * address space wants to know that the page is about
-			 * to become writable */
-			if (vma->vm_ops->page_mkwrite &&
-			    vma->vm_ops->page_mkwrite(vma, new_page) < 0
-			    ) {
-				page_cache_release(new_page);
-				return VM_FAULT_SIGBUS;
+			 * to become writable
+			 */
+			if (vma->vm_ops->page_mkwrite) {
+				unlock_page(page);
+				if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
+					ret = VM_FAULT_SIGBUS;
+					anon = 1; /* no anon but release vmf.page */
+					goto out_unlocked;
+				}
+				lock_page(page);
+				/*
+				 * XXX: this is not quite right (racy vs
+				 * invalidate) to unlock and relock the page
+				 * like this, however a better fix requires
+				 * reworking page_mkwrite locking API, which
+				 * is better done later.
+				 */
+				if (!page->mapping) {
+					ret = 0;
+					anon = 1; /* no anon but release vmf.page */
+					goto out;
+				}
 			}
 		}
+
 	}
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-	/*
-	 * For a file-backed vma, someone could have truncated or otherwise
-	 * invalidated this page.  If unmap_mapping_range got called,
-	 * retry getting the page.
-	 */
-	if (mapping && unlikely(sequence != mapping->truncate_count)) {
-		pte_unmap_unlock(page_table, ptl);
-		page_cache_release(new_page);
-		cond_resched();
-		sequence = mapping->truncate_count;
-		smp_rmb();
-		goto retry;
-	}
 
 	/*
 	 * This silly early PAGE_DIRTY setting removes a race
@@ -2381,45 +2416,63 @@ retry:
 	 * handle that later.
 	 */
 	/* Only go through if we didn't race with anybody else... */
-	if (pte_none(*page_table)) {
-		flush_icache_page(vma, new_page);
-		entry = mk_pte(new_page, vma->vm_page_prot);
-		if (write_access)
+	if (likely(pte_same(*page_table, orig_pte))) {
+		flush_icache_page(vma, page);
+		entry = mk_pte(page, vma->vm_page_prot);
+		if (flags & FAULT_FLAG_WRITE)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
 			inc_mm_counter(mm, anon_rss);
-			lru_cache_add_active(new_page);
-			page_add_new_anon_rmap(new_page, vma, address);
+			lru_cache_add_active(page);
+			page_add_new_anon_rmap(page, vma, address);
 		} else {
 			inc_mm_counter(mm, file_rss);
-			page_add_file_rmap(new_page);
-			if (write_access) {
-				dirty_page = new_page;
+			page_add_file_rmap(page);
+			if (flags & FAULT_FLAG_WRITE) {
+				dirty_page = page;
 				get_page(dirty_page);
 			}
 		}
+
+		/* no need to invalidate: a not-present page won't be cached */
+		update_mmu_cache(vma, address, entry);
+		lazy_mmu_prot_update(entry);
 	} else {
-		/* One of our sibling threads was faster, back out. */
-		page_cache_release(new_page);
-		goto unlock;
+		if (anon)
+			page_cache_release(page);
+		else
+			anon = 1; /* no anon but release faulted_page */
 	}
 
-	/* no need to invalidate: a not-present page shouldn't be cached */
-	update_mmu_cache(vma, address, entry);
-	lazy_mmu_prot_update(entry);
-unlock:
 	pte_unmap_unlock(page_table, ptl);
-	if (dirty_page) {
+
+out:
+	unlock_page(vmf.page);
+out_unlocked:
+	if (anon)
+		page_cache_release(vmf.page);
+	else if (dirty_page) {
 		set_page_dirty_balance(dirty_page);
 		put_page(dirty_page);
 	}
+
 	return ret;
-oom:
-	page_cache_release(new_page);
-	return VM_FAULT_OOM;
 }
 
+static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access, pte_t orig_pte)
+{
+	pgoff_t pgoff = (((address & PAGE_MASK)
+			- vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
+	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
+
+	return __do_fault(mm, vma, address, page_table, pmd, pgoff,
+							flags, orig_pte);
+}
+
+
 /*
  * do_no_pfn() tries to create a new page mapping for a page without
  * a struct_page backing it
@@ -2443,7 +2496,6 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pte_t entry;
 	unsigned long pfn;
-	int ret = VM_FAULT_MINOR;
 
 	pte_unmap(page_table);
 	BUG_ON(!(vma->vm_flags & VM_PFNMAP));
@@ -2455,7 +2507,7 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
 	else if (unlikely(pfn == NOPFN_SIGBUS))
 		return VM_FAULT_SIGBUS;
 	else if (unlikely(pfn == NOPFN_REFAULT))
-		return VM_FAULT_MINOR;
+		return 0;
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 
@@ -2467,7 +2519,7 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
 		set_pte_at(mm, address, page_table, entry);
 	}
 	pte_unmap_unlock(page_table, ptl);
-	return ret;
+	return 0;
 }
 
 /*
@@ -2479,33 +2531,30 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		int write_access, pte_t orig_pte)
 {
+	unsigned int flags = FAULT_FLAG_NONLINEAR |
+				(write_access ? FAULT_FLAG_WRITE : 0);
 	pgoff_t pgoff;
-	int err;
 
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		return VM_FAULT_MINOR;
+		return 0;
 
-	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
+	if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
+			!(vma->vm_flags & VM_CAN_NONLINEAR))) {
 		/*
 		 * Page table corrupted: show pte and kill process.
 		 */
 		print_bad_pte(vma, orig_pte, address);
 		return VM_FAULT_OOM;
 	}
-	/* We can then assume vm->vm_ops && vma->vm_ops->populate */
 
 	pgoff = pte_to_pgoff(orig_pte);
-	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
-					vma->vm_page_prot, pgoff, 0);
-	if (err == -ENOMEM)
-		return VM_FAULT_OOM;
-	if (err)
-		return VM_FAULT_SIGBUS;
-	return VM_FAULT_MAJOR;
+
+	return __do_fault(mm, vma, address, page_table, pmd, pgoff,
+							flags, orig_pte);
 }
 
 /*
@@ -2532,10 +2581,9 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (vma->vm_ops) {
-				if (vma->vm_ops->nopage)
-					return do_no_page(mm, vma, address,
-							pte, pmd,
-							write_access);
+				if (vma->vm_ops->fault || vma->vm_ops->nopage)
+					return do_linear_fault(mm, vma, address,
+						pte, pmd, write_access, entry);
 				if (unlikely(vma->vm_ops->nopfn))
 					return do_no_pfn(mm, vma, address, pte,
 							 pmd, write_access);
@@ -2544,7 +2592,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 						 pte, pmd, write_access);
 		}
 		if (pte_file(entry))
-			return do_file_page(mm, vma, address,
+			return do_nonlinear_fault(mm, vma, address,
 					pte, pmd, write_access, entry);
 		return do_swap_page(mm, vma, address,
 					pte, pmd, write_access, entry);
@@ -2576,13 +2624,13 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 	}
 unlock:
 	pte_unmap_unlock(pte, ptl);
-	return VM_FAULT_MINOR;
+	return 0;
 }
 
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, int write_access)
 {
 	pgd_t *pgd;
@@ -2611,8 +2659,6 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
 }
 
-EXPORT_SYMBOL_GPL(__handle_mm_fault);
-
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
  * Allocate page upper directory.
@@ -2673,7 +2719,7 @@ int make_pages_present(unsigned long addr, unsigned long end)
 	write = (vma->vm_flags & VM_WRITE) != 0;
 	BUG_ON(addr >= end);
 	BUG_ON(end > vma->vm_end);
-	len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
+	len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
 	ret = get_user_pages(current, current->mm, addr,
 			len, write, 0, NULL, NULL);
 	if (ret < 0)
@@ -2817,3 +2863,4 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 
 	return buf - old_buf;
 }
+EXPORT_SYMBOL_GPL(access_process_vm);
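For illustration only, and not part of this diff: a minimal sketch of how a driver might implement the ->fault callback that __do_fault() above dispatches to, instead of the legacy ->nopage path. The names my_dev, my_fault and my_vm_ops are hypothetical; what is grounded in the diff is the handler signature taking a struct vm_fault, the use of vmf->pgoff and vmf->page, and the VM_FAULT_SIGBUS / VM_FAULT_LOCKED return semantics.

/*
 * Illustrative sketch, assuming a driver that preallocated its backing
 * pages and stashed its state in vma->vm_private_data at mmap time.
 */
#include <linux/mm.h>

struct my_dev {
	unsigned long nr_pages;		/* number of backing pages */
	struct page **pages;		/* preallocated backing pages */
};

static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct my_dev *dev = vma->vm_private_data;

	/* vmf->pgoff is the page offset into the mapping, set up by __do_fault() */
	if (vmf->pgoff >= dev->nr_pages)
		return VM_FAULT_SIGBUS;	/* fault beyond the end of the buffer */

	vmf->page = dev->pages[vmf->pgoff];
	get_page(vmf->page);		/* hand back a referenced page */

	/*
	 * Returning 0 without VM_FAULT_LOCKED means __do_fault() will
	 * lock_page() the page itself before installing the pte.
	 */
	return 0;
}

static struct vm_operations_struct my_vm_ops = {
	.fault	= my_fault,
};

A driver would point vma->vm_ops at my_vm_ops from its mmap method. A filesystem that truncates or invalidates pages under mmaped areas would instead return the page locked and OR VM_FAULT_LOCKED into the return value, as the comment added in unmap_mapping_range_vma() above requires.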