path: root/mm/memory.c
author		Nick Piggin <npiggin@suse.de>	2007-07-19 04:46:59 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-07-19 13:04:41 -0400
commit		54cb8821de07f2ffcd28c380ce9b93d5784b40d7 (patch)
tree		1de676534963d96af42863b20191bc9f80060dea /mm/memory.c
parent		d00806b183152af6d24f46f0c33f14162ca1262a (diff)
mm: merge populate and nopage into fault (fixes nonlinear)
Nonlinear mappings are (AFAIKS) simply a virtual memory concept that encodes the virtual address -> file offset differently from linear mappings.

->populate is a layering violation because the filesystem/pagecache code should not need to know anything about the virtual memory mapping. The hitch here is that the ->nopage handler didn't pass down enough information (ie. pgoff). But it is more logical to pass pgoff rather than have the ->nopage function calculate it itself anyway (because that's a similar layering violation). Having the populate handler install the pte itself is likewise a nasty thing to be doing.

This patch introduces a new fault handler that replaces ->nopage and ->populate and (later) ->nopfn. Most of the old mechanism is still in place so there is a lot of duplication and nice cleanups that can be removed if everyone switches over.

The rationale for doing this in the first place is that nonlinear mappings are subject to the pagefault vs invalidate/truncate race too, and it seemed stupid to duplicate the synchronisation logic rather than just consolidate the two.

After this patch, MAP_NONBLOCK no longer sets up ptes for pages present in pagecache. Seems like a fringe functionality anyway.

NOPAGE_REFAULT is removed. This should be implemented with ->fault, and no users have hit mainline yet.

[akpm@linux-foundation.org: cleanup]
[randy.dunlap@oracle.com: doc. fixes for readahead]
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
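For orientation, the handler contract can be read off the __do_fault() changes below: the VM fills a struct fault_data with the page-masked address, the pgoff and the fault flags, calls vma->vm_ops->fault(vma, &fdata), and expects the handler to set fdata.type (VM_FAULT_MINOR, VM_FAULT_SIGBUS, VM_FAULT_OOM, ...) and to return the page, locked when the vma is marked VM_CAN_INVALIDATE. A minimal, hypothetical sketch of such a handler follows; it is not part of this patch, and example_get_page() is an invented stand-in for whatever lookup/readahead a real filesystem would do:

/*
 * Hypothetical ->fault handler written against the interface introduced by
 * this patch. example_get_page() is a made-up helper, not a kernel API.
 */
static struct page *example_fault(struct vm_area_struct *vma,
					struct fault_data *fdata)
{
	struct page *page;

	/* The VM already passes the file offset; no address arithmetic here. */
	page = example_get_page(vma->vm_file, fdata->pgoff);
	if (!page) {
		fdata->type = VM_FAULT_SIGBUS;	/* or VM_FAULT_OOM */
		return NULL;
	}

	/*
	 * With VM_CAN_INVALIDATE set on the vma, the page must be returned
	 * locked so __do_fault() can close the fault vs truncate race.
	 */
	lock_page(page);

	fdata->type = VM_FAULT_MINOR;
	return page;
}

Compared with ->nopage, such a handler no longer translates the faulting virtual address into a file offset itself, and compared with ->populate it never touches page tables; __do_fault() installs the pte.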
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	132
1 file changed, 86 insertions(+), 46 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index e6c99f6b5649..eee7fec3ab54 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1047,7 +1047,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		if (pages)
 			foll_flags |= FOLL_GET;
 		if (!write && !(vma->vm_flags & VM_LOCKED) &&
-		    (!vma->vm_ops || !vma->vm_ops->nopage))
+		    (!vma->vm_ops || (!vma->vm_ops->nopage &&
+					!vma->vm_ops->fault)))
 			foll_flags |= FOLL_ANON;
 
 		do {
@@ -2288,10 +2289,10 @@ oom:
 }
 
 /*
- * do_no_page() tries to create a new page mapping. It aggressively
+ * __do_fault() tries to create a new page mapping. It aggressively
  * tries to share with existing pages, but makes a separate copy if
- * the "write_access" parameter is true in order to avoid the next
- * page fault.
+ * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
+ * the next page fault.
  *
  * As this is called only for pages that do not currently exist, we
  * do not need to flush old virtual caches or the TLB.
@@ -2300,64 +2301,82 @@ oom:
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		int write_access)
+		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
 	spinlock_t *ptl;
-	struct page *page, *nopage_page;
+	struct page *page, *faulted_page;
 	pte_t entry;
-	int ret = VM_FAULT_MINOR;
 	int anon = 0;
 	struct page *dirty_page = NULL;
+	struct fault_data fdata;
+
+	fdata.address = address & PAGE_MASK;
+	fdata.pgoff = pgoff;
+	fdata.flags = flags;
 
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
 
-	nopage_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
-	/* no page was available -- either SIGBUS, OOM or REFAULT */
-	if (unlikely(nopage_page == NOPAGE_SIGBUS))
-		return VM_FAULT_SIGBUS;
-	else if (unlikely(nopage_page == NOPAGE_OOM))
-		return VM_FAULT_OOM;
-	else if (unlikely(nopage_page == NOPAGE_REFAULT))
-		return VM_FAULT_MINOR;
+	if (likely(vma->vm_ops->fault)) {
+		fdata.type = -1;
+		faulted_page = vma->vm_ops->fault(vma, &fdata);
+		WARN_ON(fdata.type == -1);
+		if (unlikely(!faulted_page))
+			return fdata.type;
+	} else {
+		/* Legacy ->nopage path */
+		fdata.type = VM_FAULT_MINOR;
+		faulted_page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
+								&fdata.type);
+		/* no page was available -- either SIGBUS or OOM */
+		if (unlikely(faulted_page == NOPAGE_SIGBUS))
+			return VM_FAULT_SIGBUS;
+		else if (unlikely(faulted_page == NOPAGE_OOM))
+			return VM_FAULT_OOM;
+	}
 
-	BUG_ON(vma->vm_flags & VM_CAN_INVALIDATE && !PageLocked(nopage_page));
 	/*
-	 * For consistency in subsequent calls, make the nopage_page always
+	 * For consistency in subsequent calls, make the faulted_page always
 	 * locked.
 	 */
 	if (unlikely(!(vma->vm_flags & VM_CAN_INVALIDATE)))
-		lock_page(nopage_page);
+		lock_page(faulted_page);
+	else
+		BUG_ON(!PageLocked(faulted_page));
 
 	/*
 	 * Should we do an early C-O-W break?
 	 */
-	page = nopage_page;
-	if (write_access) {
+	page = faulted_page;
+	if (flags & FAULT_FLAG_WRITE) {
 		if (!(vma->vm_flags & VM_SHARED)) {
+			anon = 1;
 			if (unlikely(anon_vma_prepare(vma))) {
-				ret = VM_FAULT_OOM;
-				goto out_error;
+				fdata.type = VM_FAULT_OOM;
+				goto out;
 			}
 			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 			if (!page) {
-				ret = VM_FAULT_OOM;
-				goto out_error;
+				fdata.type = VM_FAULT_OOM;
+				goto out;
 			}
-			copy_user_highpage(page, nopage_page, address, vma);
-			anon = 1;
+			copy_user_highpage(page, faulted_page, address, vma);
 		} else {
-			/* if the page will be shareable, see if the backing
+			/*
+			 * If the page will be shareable, see if the backing
 			 * address space wants to know that the page is about
-			 * to become writable */
+			 * to become writable
+			 */
 			if (vma->vm_ops->page_mkwrite &&
 			    vma->vm_ops->page_mkwrite(vma, page) < 0) {
-				ret = VM_FAULT_SIGBUS;
-				goto out_error;
+				fdata.type = VM_FAULT_SIGBUS;
+				anon = 1; /* no anon but release faulted_page */
+				goto out;
 			}
 		}
+
 	}
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2373,10 +2392,10 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * handle that later.
 	 */
 	/* Only go through if we didn't race with anybody else... */
-	if (likely(pte_none(*page_table))) {
+	if (likely(pte_same(*page_table, orig_pte))) {
 		flush_icache_page(vma, page);
 		entry = mk_pte(page, vma->vm_page_prot);
-		if (write_access)
+		if (flags & FAULT_FLAG_WRITE)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
@@ -2386,7 +2405,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(page);
-			if (write_access) {
+			if (flags & FAULT_FLAG_WRITE) {
 				dirty_page = page;
 				get_page(dirty_page);
 			}
@@ -2399,25 +2418,42 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (anon)
 			page_cache_release(page);
 		else
-			anon = 1; /* not anon, but release nopage_page */
+			anon = 1; /* no anon but release faulted_page */
 	}
 
 	pte_unmap_unlock(page_table, ptl);
 
 out:
-	unlock_page(nopage_page);
+	unlock_page(faulted_page);
 	if (anon)
-		page_cache_release(nopage_page);
+		page_cache_release(faulted_page);
 	else if (dirty_page) {
 		set_page_dirty_balance(dirty_page);
 		put_page(dirty_page);
 	}
 
-	return ret;
+	return fdata.type;
+}
 
-out_error:
-	anon = 1; /* relase nopage_page */
-	goto out;
+static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access, pte_t orig_pte)
+{
+	pgoff_t pgoff = (((address & PAGE_MASK)
+			- vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
+	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
+
+	return __do_fault(mm, vma, address, page_table, pmd, pgoff, flags, orig_pte);
+}
+
+static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access, pgoff_t pgoff, pte_t orig_pte)
+{
+	unsigned int flags = FAULT_FLAG_NONLINEAR |
+			(write_access ? FAULT_FLAG_WRITE : 0);
+
+	return __do_fault(mm, vma, address, page_table, pmd, pgoff, flags, orig_pte);
 }
 
 /*
@@ -2496,9 +2532,14 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		print_bad_pte(vma, orig_pte, address);
 		return VM_FAULT_OOM;
 	}
-	/* We can then assume vm->vm_ops && vma->vm_ops->populate */
 
 	pgoff = pte_to_pgoff(orig_pte);
+
+	if (vma->vm_ops && vma->vm_ops->fault)
+		return do_nonlinear_fault(mm, vma, address, page_table, pmd,
+					write_access, pgoff, orig_pte);
+
+	/* We can then assume vm->vm_ops && vma->vm_ops->populate */
 	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
 					vma->vm_page_prot, pgoff, 0);
 	if (err == -ENOMEM)
@@ -2532,10 +2573,9 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (vma->vm_ops) {
-				if (vma->vm_ops->nopage)
-					return do_no_page(mm, vma, address,
-							  pte, pmd,
-							  write_access);
+				if (vma->vm_ops->fault || vma->vm_ops->nopage)
+					return do_linear_fault(mm, vma, address,
+						pte, pmd, write_access, entry);
 				if (unlikely(vma->vm_ops->nopfn))
 					return do_no_pfn(mm, vma, address, pte,
 								pmd, write_access);
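As a rough worked example of the offset arithmetic that do_linear_fault() above now performs on the handler's behalf (previously each ->nopage implementation derived the file offset from the virtual address itself), here is a standalone sketch with hypothetical values, assuming 4 KiB pages so PAGE_CACHE_SHIFT equals PAGE_SHIFT (12); the macros are redefined locally only so the snippet compiles in userspace:

/* Standalone illustration; these mirror the kernel macros for 4 KiB pages. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))

int main(void)
{
	/* Hypothetical mapping: file offset 0x3000 mapped at vm_start. */
	unsigned long vm_start = 0x2aaab0000000UL;
	unsigned long vm_pgoff = 0x3000UL >> PAGE_SHIFT;	/* 3 */
	unsigned long address  = 0x2aaab0005123UL;		/* faulting address */

	/* Same expression as in do_linear_fault() in the patch. */
	unsigned long pgoff = (((address & PAGE_MASK) - vm_start) >> PAGE_SHIFT)
				+ vm_pgoff;

	printf("pgoff = %lu\n", pgoff);				/* prints 8 */
	return 0;
}

The faulting address rounds down to 0x2aaab0005000, which is 5 pages past vm_start; adding the mapping's starting file page (3) gives file page 8, which is what the ->fault handler receives in fdata.pgoff.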