path: root/mm/memory.c
Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c  341
1 file changed, 194 insertions(+), 147 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index f64cbf9baa36..ca8cac11bd2c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -78,11 +78,9 @@ unsigned long num_physpages;
  * and ZONE_HIGHMEM.
  */
 void * high_memory;
-unsigned long vmalloc_earlyreserve;
 
 EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
-EXPORT_SYMBOL(vmalloc_earlyreserve);
 
 int randomize_va_space __read_mostly = 1;
 
@@ -1049,43 +1047,51 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
         if (pages)
             foll_flags |= FOLL_GET;
         if (!write && !(vma->vm_flags & VM_LOCKED) &&
-            (!vma->vm_ops || !vma->vm_ops->nopage))
+            (!vma->vm_ops || (!vma->vm_ops->nopage &&
+                              !vma->vm_ops->fault)))
             foll_flags |= FOLL_ANON;
 
         do {
             struct page *page;
 
+            /*
+             * If tsk is ooming, cut off its access to large memory
+             * allocations. It has a pending SIGKILL, but it can't
+             * be processed until returning to user space.
+             */
+            if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
+                return -ENOMEM;
+
             if (write)
                 foll_flags |= FOLL_WRITE;
 
             cond_resched();
             while (!(page = follow_page(vma, start, foll_flags))) {
                 int ret;
-                ret = __handle_mm_fault(mm, vma, start,
+                ret = handle_mm_fault(mm, vma, start,
                         foll_flags & FOLL_WRITE);
+                if (ret & VM_FAULT_ERROR) {
+                    if (ret & VM_FAULT_OOM)
+                        return i ? i : -ENOMEM;
+                    else if (ret & VM_FAULT_SIGBUS)
+                        return i ? i : -EFAULT;
+                    BUG();
+                }
+                if (ret & VM_FAULT_MAJOR)
+                    tsk->maj_flt++;
+                else
+                    tsk->min_flt++;
+
                 /*
-                 * The VM_FAULT_WRITE bit tells us that do_wp_page has
-                 * broken COW when necessary, even if maybe_mkwrite
-                 * decided not to set pte_write. We can thus safely do
-                 * subsequent page lookups as if they were reads.
+                 * The VM_FAULT_WRITE bit tells us that
+                 * do_wp_page has broken COW when necessary,
+                 * even if maybe_mkwrite decided not to set
+                 * pte_write. We can thus safely do subsequent
+                 * page lookups as if they were reads.
                  */
                 if (ret & VM_FAULT_WRITE)
                     foll_flags &= ~FOLL_WRITE;
 
-                switch (ret & ~VM_FAULT_WRITE) {
-                case VM_FAULT_MINOR:
-                    tsk->min_flt++;
-                    break;
-                case VM_FAULT_MAJOR:
-                    tsk->maj_flt++;
-                    break;
-                case VM_FAULT_SIGBUS:
-                    return i ? i : -EFAULT;
-                case VM_FAULT_OOM:
-                    return i ? i : -ENOMEM;
-                default:
-                    BUG();
-                }
                 cond_resched();
             }
             if (pages) {
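
The hunk above switches get_user_pages() from the old exclusive VM_FAULT_MINOR/MAJOR/SIGBUS/OOM return codes (decoded with a switch) to the new bit-flag convention, where 0 means a successful minor fault and errors are tested via VM_FAULT_ERROR. The stand-alone sketch below models that decoding outside the kernel; the flag values and the helper decode_fault() are illustrative stand-ins, not kernel definitions.

/*
 * Stand-alone model of the new fault-return decoding in get_user_pages().
 * Flag values and decode_fault() are illustrative, not kernel definitions.
 */
#include <stdio.h>

#define VM_FAULT_OOM     0x0001
#define VM_FAULT_SIGBUS  0x0002
#define VM_FAULT_MAJOR   0x0004
#define VM_FAULT_WRITE   0x0008
#define VM_FAULT_ERROR   (VM_FAULT_OOM | VM_FAULT_SIGBUS)

#define ENOMEM  12
#define EFAULT  14

/*
 * Mirror of the loop body above: map an error bit to a return value
 * (keeping pages already pinned), otherwise account a major/minor fault.
 */
static int decode_fault(unsigned int ret, int i, long *maj_flt, long *min_flt)
{
    if (ret & VM_FAULT_ERROR) {
        if (ret & VM_FAULT_OOM)
            return i ? i : -ENOMEM;
        return i ? i : -EFAULT;   /* VM_FAULT_SIGBUS */
    }
    if (ret & VM_FAULT_MAJOR)
        (*maj_flt)++;
    else
        (*min_flt)++;
    return 0;   /* keep going */
}

int main(void)
{
    long maj = 0, min = 0;

    /* major fault that also broke COW: not an error, just accounting */
    printf("%d\n", decode_fault(VM_FAULT_MAJOR | VM_FAULT_WRITE, 3, &maj, &min));
    /* OOM after 3 pages were already pinned: report partial progress */
    printf("%d\n", decode_fault(VM_FAULT_OOM, 3, &maj, &min));
    /* OOM before any page was pinned: report -ENOMEM */
    printf("%d\n", decode_fault(VM_FAULT_OOM, 0, &maj, &min));
    printf("maj=%ld min=%ld\n", maj, min);
    return 0;
}

The point of the bit-flag scheme, as the hunk shows, is that independent facts (major fault, COW broken, error kind) can now be reported together in one return value instead of competing for a single code.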
@@ -1632,7 +1638,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
     struct page *old_page, *new_page;
     pte_t entry;
-    int reuse = 0, ret = VM_FAULT_MINOR;
+    int reuse = 0, ret = 0;
     struct page *dirty_page = NULL;
 
     old_page = vm_normal_page(vma, address, orig_pte);
@@ -1709,11 +1715,11 @@ gotten:
     if (unlikely(anon_vma_prepare(vma)))
         goto oom;
     if (old_page == ZERO_PAGE(address)) {
-        new_page = alloc_zeroed_user_highpage(vma, address);
+        new_page = alloc_zeroed_user_highpage_movable(vma, address);
         if (!new_page)
             goto oom;
     } else {
-        new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
         if (!new_page)
             goto oom;
         cow_user_page(new_page, old_page, address, vma);
@@ -1759,6 +1765,15 @@ gotten:
 unlock:
     pte_unmap_unlock(page_table, ptl);
     if (dirty_page) {
+        /*
+         * Yes, Virginia, this is actually required to prevent a race
+         * with clear_page_dirty_for_io() from clearing the page dirty
+         * bit after it clear all dirty ptes, but before a racing
+         * do_wp_page installs a dirty pte.
+         *
+         * do_no_page is protected similarly.
+         */
+        wait_on_page_locked(dirty_page);
         set_page_dirty_balance(dirty_page);
         put_page(dirty_page);
     }
@@ -1825,6 +1840,13 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma,
     unsigned long restart_addr;
     int need_break;
 
+    /*
+     * files that support invalidating or truncating portions of the
+     * file from under mmaped areas must have their ->fault function
+     * return a locked page (and set VM_FAULT_LOCKED in the return).
+     * This provides synchronisation against concurrent unmapping here.
+     */
+
 again:
     restart_addr = vma->vm_truncate_count;
     if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
@@ -1953,17 +1975,8 @@ void unmap_mapping_range(struct address_space *mapping,
 
     spin_lock(&mapping->i_mmap_lock);
 
-    /* serialize i_size write against truncate_count write */
-    smp_wmb();
-    /* Protect against page faults, and endless unmapping loops */
+    /* Protect against endless unmapping loops */
     mapping->truncate_count++;
-    /*
-     * For archs where spin_lock has inclusive semantics like ia64
-     * this smp_mb() will prevent to read pagetable contents
-     * before the truncate_count increment is visible to
-     * other cpus.
-     */
-    smp_mb();
     if (unlikely(is_restart_addr(mapping->truncate_count))) {
         if (mapping->truncate_count == 0)
             reset_vma_truncate_counts(mapping);
@@ -2002,8 +2015,18 @@ int vmtruncate(struct inode * inode, loff_t offset)
     if (IS_SWAPFILE(inode))
         goto out_busy;
     i_size_write(inode, offset);
+
+    /*
+     * unmap_mapping_range is called twice, first simply for efficiency
+     * so that truncate_inode_pages does fewer single-page unmaps. However
+     * after this first call, and before truncate_inode_pages finishes,
+     * it is possible for private pages to be COWed, which remain after
+     * truncate_inode_pages finishes, hence the second unmap_mapping_range
+     * call must be made for correctness.
+     */
     unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
     truncate_inode_pages(mapping, offset);
+    unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
     goto out_truncate;
 
 do_expand:
@@ -2043,6 +2066,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
     down_write(&inode->i_alloc_sem);
     unmap_mapping_range(mapping, offset, (end - offset), 1);
     truncate_inode_pages_range(mapping, offset, end);
+    unmap_mapping_range(mapping, offset, (end - offset), 1);
     inode->i_op->truncate_range(inode, offset, end);
     up_write(&inode->i_alloc_sem);
     mutex_unlock(&inode->i_mutex);
@@ -2124,7 +2148,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
     struct page *page;
     swp_entry_t entry;
     pte_t pte;
-    int ret = VM_FAULT_MINOR;
+    int ret = 0;
 
     if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
         goto out;
@@ -2192,15 +2216,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
     unlock_page(page);
 
     if (write_access) {
+        /* XXX: We could OR the do_wp_page code with this one? */
         if (do_wp_page(mm, vma, address,
-                page_table, pmd, ptl, pte) == VM_FAULT_OOM)
+                page_table, pmd, ptl, pte) & VM_FAULT_OOM)
             ret = VM_FAULT_OOM;
         goto out;
     }
 
     /* No need to invalidate - it was non-present before */
     update_mmu_cache(vma, address, pte);
-    lazy_mmu_prot_update(pte);
 unlock:
     pte_unmap_unlock(page_table, ptl);
 out:
@@ -2231,7 +2255,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
     if (unlikely(anon_vma_prepare(vma)))
         goto oom;
-    page = alloc_zeroed_user_highpage(vma, address);
+    page = alloc_zeroed_user_highpage_movable(vma, address);
     if (!page)
         goto oom;
 
@@ -2265,7 +2289,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
     lazy_mmu_prot_update(entry);
 unlock:
     pte_unmap_unlock(page_table, ptl);
-    return VM_FAULT_MINOR;
+    return 0;
 release:
     page_cache_release(page);
     goto unlock;
@@ -2274,10 +2298,10 @@ oom:
 }
 
 /*
- * do_no_page() tries to create a new page mapping. It aggressively
+ * __do_fault() tries to create a new page mapping. It aggressively
  * tries to share with existing pages, but makes a separate copy if
- * the "write_access" parameter is true in order to avoid the next
- * page fault.
+ * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
+ * the next page fault.
  *
  * As this is called only for pages that do not currently exist, we
  * do not need to flush old virtual caches or the TLB.
@@ -2286,89 +2310,100 @@ oom:
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         unsigned long address, pte_t *page_table, pmd_t *pmd,
-        int write_access)
+        pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
     spinlock_t *ptl;
-    struct page *new_page;
-    struct address_space *mapping = NULL;
+    struct page *page;
     pte_t entry;
-    unsigned int sequence = 0;
-    int ret = VM_FAULT_MINOR;
     int anon = 0;
     struct page *dirty_page = NULL;
+    struct vm_fault vmf;
+    int ret;
+
+    vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+    vmf.pgoff = pgoff;
+    vmf.flags = flags;
+    vmf.page = NULL;
 
     pte_unmap(page_table);
     BUG_ON(vma->vm_flags & VM_PFNMAP);
 
-    if (vma->vm_file) {
-        mapping = vma->vm_file->f_mapping;
-        sequence = mapping->truncate_count;
-        smp_rmb(); /* serializes i_size against truncate_count */
+    if (likely(vma->vm_ops->fault)) {
+        ret = vma->vm_ops->fault(vma, &vmf);
+        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+            return ret;
+    } else {
+        /* Legacy ->nopage path */
+        ret = 0;
+        vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
+        /* no page was available -- either SIGBUS or OOM */
+        if (unlikely(vmf.page == NOPAGE_SIGBUS))
+            return VM_FAULT_SIGBUS;
+        else if (unlikely(vmf.page == NOPAGE_OOM))
+            return VM_FAULT_OOM;
     }
-retry:
-    new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
+
     /*
-     * No smp_rmb is needed here as long as there's a full
-     * spin_lock/unlock sequence inside the ->nopage callback
-     * (for the pagecache lookup) that acts as an implicit
-     * smp_mb() and prevents the i_size read to happen
-     * after the next truncate_count read.
+     * For consistency in subsequent calls, make the faulted page always
+     * locked.
      */
-
-    /* no page was available -- either SIGBUS, OOM or REFAULT */
-    if (unlikely(new_page == NOPAGE_SIGBUS))
-        return VM_FAULT_SIGBUS;
-    else if (unlikely(new_page == NOPAGE_OOM))
-        return VM_FAULT_OOM;
-    else if (unlikely(new_page == NOPAGE_REFAULT))
-        return VM_FAULT_MINOR;
+    if (unlikely(!(ret & VM_FAULT_LOCKED)))
+        lock_page(vmf.page);
+    else
+        VM_BUG_ON(!PageLocked(vmf.page));
 
     /*
      * Should we do an early C-O-W break?
      */
-    if (write_access) {
+    page = vmf.page;
+    if (flags & FAULT_FLAG_WRITE) {
         if (!(vma->vm_flags & VM_SHARED)) {
-            struct page *page;
-
-            if (unlikely(anon_vma_prepare(vma)))
-                goto oom;
-            page = alloc_page_vma(GFP_HIGHUSER, vma, address);
-            if (!page)
-                goto oom;
-            copy_user_highpage(page, new_page, address, vma);
-            page_cache_release(new_page);
-            new_page = page;
             anon = 1;
-
+            if (unlikely(anon_vma_prepare(vma))) {
+                ret = VM_FAULT_OOM;
+                goto out;
+            }
+            page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
+                        vma, address);
+            if (!page) {
+                ret = VM_FAULT_OOM;
+                goto out;
+            }
+            copy_user_highpage(page, vmf.page, address, vma);
         } else {
-            /* if the page will be shareable, see if the backing
+            /*
+             * If the page will be shareable, see if the backing
              * address space wants to know that the page is about
-             * to become writable */
-            if (vma->vm_ops->page_mkwrite &&
-                vma->vm_ops->page_mkwrite(vma, new_page) < 0
-                ) {
-                page_cache_release(new_page);
-                return VM_FAULT_SIGBUS;
+             * to become writable
+             */
+            if (vma->vm_ops->page_mkwrite) {
+                unlock_page(page);
+                if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
+                    ret = VM_FAULT_SIGBUS;
+                    anon = 1; /* no anon but release vmf.page */
+                    goto out_unlocked;
+                }
+                lock_page(page);
+                /*
+                 * XXX: this is not quite right (racy vs
+                 * invalidate) to unlock and relock the page
+                 * like this, however a better fix requires
+                 * reworking page_mkwrite locking API, which
+                 * is better done later.
+                 */
+                if (!page->mapping) {
+                    ret = 0;
+                    anon = 1; /* no anon but release vmf.page */
+                    goto out;
+                }
             }
         }
+
     }
 
     page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-    /*
-     * For a file-backed vma, someone could have truncated or otherwise
-     * invalidated this page. If unmap_mapping_range got called,
-     * retry getting the page.
-     */
-    if (mapping && unlikely(sequence != mapping->truncate_count)) {
-        pte_unmap_unlock(page_table, ptl);
-        page_cache_release(new_page);
-        cond_resched();
-        sequence = mapping->truncate_count;
-        smp_rmb();
-        goto retry;
-    }
 
     /*
      * This silly early PAGE_DIRTY setting removes a race
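
This hunk is the core of the conversion: do_no_page() becomes __do_fault(), which packs the fault parameters into a struct vm_fault, prefers the new vma->vm_ops->fault() callback, and keeps a legacy ->nopage path for unconverted drivers. The stand-alone sketch below models just that dispatch shape; the struct and callback types are simplified stand-ins for illustration, not the kernel's actual definitions.

/*
 * Stand-alone model of the ->fault vs legacy ->nopage dispatch done by
 * __do_fault() above. All types here are simplified stand-ins.
 */
#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT      12
#define PAGE_MASK       (~((1UL << PAGE_SHIFT) - 1))
#define NOPAGE_SIGBUS   ((struct page *)NULL)

struct page { int id; };

struct vm_fault {                   /* descriptor filled by the caller */
    void *virtual_address;          /* page-aligned faulting address */
    unsigned long pgoff;            /* page offset within the object */
    unsigned int flags;             /* e.g. a write-fault bit */
    struct page *page;              /* set by the handler on success */
};

struct vm_ops {
    int (*fault)(struct vm_fault *vmf);                     /* new style */
    struct page *(*nopage)(unsigned long addr, int *type);  /* old style */
};

static struct page backing = { 42 };

static int demo_fault(struct vm_fault *vmf)
{
    vmf->page = &backing;
    return 0;           /* status (and hints like "locked") in the return */
}

static struct page *demo_nopage(unsigned long addr, int *type)
{
    (void)addr; (void)type;
    return &backing;    /* page pointer (or NOPAGE_SIGBUS) in the return */
}

static int fault_in(const struct vm_ops *ops, unsigned long addr)
{
    struct vm_fault vmf = {
        .virtual_address = (void *)(addr & PAGE_MASK),
        .pgoff = addr >> PAGE_SHIFT,
    };

    if (ops->fault)
        return ops->fault(&vmf) ? -1 : vmf.page->id;

    /* legacy path, as in the hunk above */
    vmf.page = ops->nopage(addr & PAGE_MASK, NULL);
    return vmf.page == NOPAGE_SIGBUS ? -1 : vmf.page->id;
}

int main(void)
{
    struct vm_ops new_ops = { .fault = demo_fault };
    struct vm_ops old_ops = { .nopage = demo_nopage };

    printf("new: %d  old: %d\n",
           fault_in(&new_ops, 0x7f0012345678UL),
           fault_in(&old_ops, 0x7f0012345678UL));
    return 0;
}

The design point the hunk is after: status travels in the return value while the page itself travels through the descriptor, so extra information (such as "the page is already locked") can be added later without changing every handler's signature again.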
@@ -2381,45 +2416,63 @@ retry:
      * handle that later.
      */
     /* Only go through if we didn't race with anybody else... */
-    if (pte_none(*page_table)) {
-        flush_icache_page(vma, new_page);
-        entry = mk_pte(new_page, vma->vm_page_prot);
-        if (write_access)
+    if (likely(pte_same(*page_table, orig_pte))) {
+        flush_icache_page(vma, page);
+        entry = mk_pte(page, vma->vm_page_prot);
+        if (flags & FAULT_FLAG_WRITE)
             entry = maybe_mkwrite(pte_mkdirty(entry), vma);
         set_pte_at(mm, address, page_table, entry);
         if (anon) {
             inc_mm_counter(mm, anon_rss);
-            lru_cache_add_active(new_page);
-            page_add_new_anon_rmap(new_page, vma, address);
+            lru_cache_add_active(page);
+            page_add_new_anon_rmap(page, vma, address);
         } else {
             inc_mm_counter(mm, file_rss);
-            page_add_file_rmap(new_page);
-            if (write_access) {
-                dirty_page = new_page;
+            page_add_file_rmap(page);
+            if (flags & FAULT_FLAG_WRITE) {
+                dirty_page = page;
                 get_page(dirty_page);
             }
         }
+
+        /* no need to invalidate: a not-present page won't be cached */
+        update_mmu_cache(vma, address, entry);
+        lazy_mmu_prot_update(entry);
     } else {
-        /* One of our sibling threads was faster, back out. */
-        page_cache_release(new_page);
-        goto unlock;
+        if (anon)
+            page_cache_release(page);
+        else
+            anon = 1; /* no anon but release faulted_page */
     }
 
-    /* no need to invalidate: a not-present page shouldn't be cached */
-    update_mmu_cache(vma, address, entry);
-    lazy_mmu_prot_update(entry);
-unlock:
     pte_unmap_unlock(page_table, ptl);
-    if (dirty_page) {
+
+out:
+    unlock_page(vmf.page);
+out_unlocked:
+    if (anon)
+        page_cache_release(vmf.page);
+    else if (dirty_page) {
         set_page_dirty_balance(dirty_page);
         put_page(dirty_page);
     }
+
     return ret;
-oom:
-    page_cache_release(new_page);
-    return VM_FAULT_OOM;
 }
 
+static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+        unsigned long address, pte_t *page_table, pmd_t *pmd,
+        int write_access, pte_t orig_pte)
+{
+    pgoff_t pgoff = (((address & PAGE_MASK)
+            - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
+    unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
+
+    return __do_fault(mm, vma, address, page_table, pmd, pgoff,
+            flags, orig_pte);
+}
+
+
 /*
  * do_no_pfn() tries to create a new page mapping for a page without
  * a struct_page backing it
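
do_linear_fault(), added above, derives the file page offset from the faulting address as ((address & PAGE_MASK) - vma->vm_start) >> PAGE_CACHE_SHIFT, plus vma->vm_pgoff. The stand-alone check below runs that arithmetic with 4 KiB pages (a shift of 12 is assumed for illustration); vm_start and vm_pgoff are plain variables here, not kernel structures.

/*
 * Stand-alone check of the linear pgoff arithmetic used by do_linear_fault().
 * A 4 KiB page size (shift of 12) is assumed for illustration.
 */
#include <stdio.h>

#define PAGE_SHIFT  12
#define PAGE_SIZE   (1UL << PAGE_SHIFT)
#define PAGE_MASK   (~(PAGE_SIZE - 1))

int main(void)
{
    /* a mapping whose first page corresponds to file page 16 (64 KiB) */
    unsigned long vm_start = 0x700000000000UL;
    unsigned long vm_pgoff = 16;

    /* fault three pages plus a few bytes into the mapping */
    unsigned long address = vm_start + 3 * PAGE_SIZE + 0x123;

    unsigned long pgoff = (((address & PAGE_MASK) - vm_start)
                >> PAGE_SHIFT) + vm_pgoff;

    /* expect file page 19, i.e. byte offset 19 * 4096 = 77824 */
    printf("pgoff = %lu, file offset = %lu bytes\n",
           pgoff, pgoff << PAGE_SHIFT);
    return 0;
}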
@@ -2443,7 +2496,6 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
     spinlock_t *ptl;
     pte_t entry;
     unsigned long pfn;
-    int ret = VM_FAULT_MINOR;
 
     pte_unmap(page_table);
     BUG_ON(!(vma->vm_flags & VM_PFNMAP));
@@ -2455,7 +2507,7 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
     else if (unlikely(pfn == NOPFN_SIGBUS))
         return VM_FAULT_SIGBUS;
     else if (unlikely(pfn == NOPFN_REFAULT))
-        return VM_FAULT_MINOR;
+        return 0;
 
     page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 
@@ -2467,7 +2519,7 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
         set_pte_at(mm, address, page_table, entry);
     }
     pte_unmap_unlock(page_table, ptl);
-    return ret;
+    return 0;
 }
 
 /*
@@ -2479,33 +2531,30 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         unsigned long address, pte_t *page_table, pmd_t *pmd,
         int write_access, pte_t orig_pte)
 {
+    unsigned int flags = FAULT_FLAG_NONLINEAR |
+                (write_access ? FAULT_FLAG_WRITE : 0);
     pgoff_t pgoff;
-    int err;
 
     if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-        return VM_FAULT_MINOR;
+        return 0;
 
-    if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
+    if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
+            !(vma->vm_flags & VM_CAN_NONLINEAR))) {
         /*
          * Page table corrupted: show pte and kill process.
          */
         print_bad_pte(vma, orig_pte, address);
         return VM_FAULT_OOM;
     }
-    /* We can then assume vm->vm_ops && vma->vm_ops->populate */
 
     pgoff = pte_to_pgoff(orig_pte);
-    err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
-            vma->vm_page_prot, pgoff, 0);
-    if (err == -ENOMEM)
-        return VM_FAULT_OOM;
-    if (err)
-        return VM_FAULT_SIGBUS;
-    return VM_FAULT_MAJOR;
+
+    return __do_fault(mm, vma, address, page_table, pmd, pgoff,
+            flags, orig_pte);
 }
 
 /*
@@ -2532,10 +2581,9 @@ static inline int handle_pte_fault(struct mm_struct *mm,
     if (!pte_present(entry)) {
         if (pte_none(entry)) {
             if (vma->vm_ops) {
-                if (vma->vm_ops->nopage)
-                    return do_no_page(mm, vma, address,
-                        pte, pmd,
-                        write_access);
+                if (vma->vm_ops->fault || vma->vm_ops->nopage)
+                    return do_linear_fault(mm, vma, address,
+                        pte, pmd, write_access, entry);
                 if (unlikely(vma->vm_ops->nopfn))
                     return do_no_pfn(mm, vma, address, pte,
                             pmd, write_access);
@@ -2544,7 +2592,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
                     pte, pmd, write_access);
         }
         if (pte_file(entry))
-            return do_file_page(mm, vma, address,
+            return do_nonlinear_fault(mm, vma, address,
                     pte, pmd, write_access, entry);
         return do_swap_page(mm, vma, address,
                     pte, pmd, write_access, entry);
@@ -2576,13 +2624,13 @@ static inline int handle_pte_fault(struct mm_struct *mm,
     }
 unlock:
     pte_unmap_unlock(pte, ptl);
-    return VM_FAULT_MINOR;
+    return 0;
 }
 
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         unsigned long address, int write_access)
 {
     pgd_t *pgd;
@@ -2611,8 +2659,6 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
     return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
 }
 
-EXPORT_SYMBOL_GPL(__handle_mm_fault);
-
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
  * Allocate page upper directory.
@@ -2673,7 +2719,7 @@ int make_pages_present(unsigned long addr, unsigned long end)
     write = (vma->vm_flags & VM_WRITE) != 0;
     BUG_ON(addr >= end);
     BUG_ON(end > vma->vm_end);
-    len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
+    len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
     ret = get_user_pages(current, current->mm, addr,
             len, write, 0, NULL, NULL);
     if (ret < 0)
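
The make_pages_present() change rewrites the open-coded rounding (end+PAGE_SIZE-1)/PAGE_SIZE as DIV_ROUND_UP(end, PAGE_SIZE); both count pages up to and including the one containing end-1, so the computed length is unchanged. A quick stand-alone check, with DIV_ROUND_UP and a 4 KiB PAGE_SIZE defined locally for illustration:

/*
 * Stand-alone check that DIV_ROUND_UP(end, PAGE_SIZE) matches the
 * open-coded (end + PAGE_SIZE - 1) / PAGE_SIZE it replaces.
 * PAGE_SIZE and DIV_ROUND_UP are defined locally for illustration.
 */
#include <stdio.h>

#define PAGE_SIZE           4096UL
#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

int main(void)
{
    unsigned long addr = 5 * PAGE_SIZE;
    unsigned long ends[] = { 6 * PAGE_SIZE, 6 * PAGE_SIZE + 1, 8 * PAGE_SIZE - 1 };

    for (int i = 0; i < 3; i++) {
        unsigned long end = ends[i];
        unsigned long old_len = (end + PAGE_SIZE - 1) / PAGE_SIZE - addr / PAGE_SIZE;
        unsigned long new_len = DIV_ROUND_UP(end, PAGE_SIZE) - addr / PAGE_SIZE;

        printf("end=%lu: old=%lu new=%lu\n", end, old_len, new_len);
    }
    return 0;
}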
@@ -2817,3 +2863,4 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 
     return buf - old_buf;
 }
+EXPORT_SYMBOL_GPL(access_process_vm);