author		Nick Piggin <npiggin@suse.de>	2007-07-19 04:46:59 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-07-19 13:04:41 -0400
commit		54cb8821de07f2ffcd28c380ce9b93d5784b40d7 (patch)
tree		1de676534963d96af42863b20191bc9f80060dea /mm/memory.c
parent		d00806b183152af6d24f46f0c33f14162ca1262a (diff)
mm: merge populate and nopage into fault (fixes nonlinear)
Nonlinear mappings are (AFAIKS) simply a virtual memory concept that encodes
the virtual address -> file offset differently from linear mappings.
->populate is a layering violation because the filesystem/pagecache code
should not need to know anything about the virtual memory mapping. The hitch here
is that the ->nopage handler didn't pass down enough information (ie. pgoff).
But it is more logical to pass pgoff rather than have the ->nopage function
calculate it itself anyway (because that's a similar layering violation).
Having the populate handler install the pte itself is likewise a nasty thing
to be doing.
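
Concretely, the offset computation moves up into the core VM with this
patch; the two cases reduce to the following (cf. do_linear_fault() and
do_file_page() in the diff below, from which both expressions are taken):

	/* Linear mapping: the file offset follows from the virtual address. */
	pgoff_t pgoff = (((address & PAGE_MASK) - vma->vm_start)
				>> PAGE_CACHE_SHIFT) + vma->vm_pgoff;

	/* Nonlinear mapping: the offset was encoded into the pte by
	 * remap_file_pages(), so it is decoded rather than computed. */
	pgoff_t pgoff_nonlinear = pte_to_pgoff(orig_pte);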
This patch introduces a new fault handler that replaces ->nopage and
->populate and (later) ->nopfn. Most of the old mechanism is still in place
so there is a lot of duplication and nice cleanups that can be removed if
everyone switches over.
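
For illustration only, a minimal ->fault implementation against the new
interface could look like the sketch below. The struct fault_data field
names (address, pgoff, flags, type) match the __do_fault() usage in the
diff; example_lookup_page() is a made-up placeholder for whatever lookup
the filesystem actually does:

	/*
	 * Sketch of a ->fault handler. The handler receives the pgoff
	 * directly and reports status through fdata->type; it does no
	 * pte work itself -- the core VM installs the pte.
	 */
	static struct page *example_fault(struct vm_area_struct *vma,
					  struct fault_data *fdata)
	{
		struct page *page;

		page = example_lookup_page(vma->vm_file, fdata->pgoff);
		if (!page) {
			fdata->type = VM_FAULT_SIGBUS;
			return NULL;
		}

		/*
		 * If the vma sets VM_CAN_INVALIDATE, the page must be
		 * returned locked (see the BUG_ON in __do_fault()).
		 */
		fdata->type = VM_FAULT_MAJOR;
		return page;
	}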
The rationale for doing this in the first place is that nonlinear mappings are
subject to the pagefault vs invalidate/truncate race too, and it seemed stupid
to duplicate the synchronisation logic rather than just consolidate the two.
After this patch, MAP_NONBLOCK no longer sets up ptes for pages present in
pagecache. Seems like a fringe functionality anyway.
NOPAGE_REFAULT is removed. This should be implemented with ->fault, and no
users have hit mainline yet.
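
A handler wanting the old refault behaviour can express it through the new
interface: __do_fault() returns fdata.type directly when ->fault returns
NULL, so a fragment like this hypothetical one makes the faulting thread
retry without any pte being installed:

	/* Refault sketch: report a minor fault with no page, so the
	 * caller returns to userspace and the access is retried. */
	fdata->type = VM_FAULT_MINOR;
	return NULL;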
[akpm@linux-foundation.org: cleanup]
[randy.dunlap@oracle.com: doc. fixes for readahead]
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	132
1 files changed, 86 insertions, 46 deletions
diff --git a/mm/memory.c b/mm/memory.c
index e6c99f6b5649..eee7fec3ab54 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1047,7 +1047,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	if (pages)
 		foll_flags |= FOLL_GET;
 	if (!write && !(vma->vm_flags & VM_LOCKED) &&
-	    (!vma->vm_ops || !vma->vm_ops->nopage))
+	    (!vma->vm_ops || (!vma->vm_ops->nopage &&
+	      !vma->vm_ops->fault)))
 		foll_flags |= FOLL_ANON;
 
 	do {
@@ -2288,10 +2289,10 @@ oom:
 }
 
 /*
- * do_no_page() tries to create a new page mapping. It aggressively
+ * __do_fault() tries to create a new page mapping. It aggressively
  * tries to share with existing pages, but makes a separate copy if
- * the "write_access" parameter is true in order to avoid the next
- * page fault.
+ * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
+ * the next page fault.
  *
  * As this is called only for pages that do not currently exist, we
  * do not need to flush old virtual caches or the TLB.
@@ -2300,64 +2301,82 @@ oom:
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		int write_access)
+		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
 	spinlock_t *ptl;
-	struct page *page, *nopage_page;
+	struct page *page, *faulted_page;
 	pte_t entry;
-	int ret = VM_FAULT_MINOR;
 	int anon = 0;
 	struct page *dirty_page = NULL;
+	struct fault_data fdata;
+
+	fdata.address = address & PAGE_MASK;
+	fdata.pgoff = pgoff;
+	fdata.flags = flags;
 
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
 
-	nopage_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
-	/* no page was available -- either SIGBUS, OOM or REFAULT */
-	if (unlikely(nopage_page == NOPAGE_SIGBUS))
-		return VM_FAULT_SIGBUS;
-	else if (unlikely(nopage_page == NOPAGE_OOM))
-		return VM_FAULT_OOM;
-	else if (unlikely(nopage_page == NOPAGE_REFAULT))
-		return VM_FAULT_MINOR;
+	if (likely(vma->vm_ops->fault)) {
+		fdata.type = -1;
+		faulted_page = vma->vm_ops->fault(vma, &fdata);
+		WARN_ON(fdata.type == -1);
+		if (unlikely(!faulted_page))
+			return fdata.type;
+	} else {
+		/* Legacy ->nopage path */
+		fdata.type = VM_FAULT_MINOR;
+		faulted_page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
+							&fdata.type);
+		/* no page was available -- either SIGBUS or OOM */
+		if (unlikely(faulted_page == NOPAGE_SIGBUS))
+			return VM_FAULT_SIGBUS;
+		else if (unlikely(faulted_page == NOPAGE_OOM))
+			return VM_FAULT_OOM;
+	}
 
-	BUG_ON(vma->vm_flags & VM_CAN_INVALIDATE && !PageLocked(nopage_page));
 	/*
-	 * For consistency in subsequent calls, make the nopage_page always
+	 * For consistency in subsequent calls, make the faulted_page always
 	 * locked.
 	 */
 	if (unlikely(!(vma->vm_flags & VM_CAN_INVALIDATE)))
-		lock_page(nopage_page);
+		lock_page(faulted_page);
+	else
+		BUG_ON(!PageLocked(faulted_page));
 
 	/*
 	 * Should we do an early C-O-W break?
 	 */
-	page = nopage_page;
-	if (write_access) {
+	page = faulted_page;
+	if (flags & FAULT_FLAG_WRITE) {
 		if (!(vma->vm_flags & VM_SHARED)) {
+			anon = 1;
 			if (unlikely(anon_vma_prepare(vma))) {
-				ret = VM_FAULT_OOM;
-				goto out_error;
+				fdata.type = VM_FAULT_OOM;
+				goto out;
 			}
 			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 			if (!page) {
-				ret = VM_FAULT_OOM;
-				goto out_error;
+				fdata.type = VM_FAULT_OOM;
+				goto out;
 			}
-			copy_user_highpage(page, nopage_page, address, vma);
-			anon = 1;
+			copy_user_highpage(page, faulted_page, address, vma);
 		} else {
-			/* if the page will be shareable, see if the backing
+			/*
+			 * If the page will be shareable, see if the backing
 			 * address space wants to know that the page is about
-			 * to become writable */
+			 * to become writable
+			 */
 			if (vma->vm_ops->page_mkwrite &&
 			    vma->vm_ops->page_mkwrite(vma, page) < 0) {
-				ret = VM_FAULT_SIGBUS;
-				goto out_error;
+				fdata.type = VM_FAULT_SIGBUS;
+				anon = 1; /* no anon but release faulted_page */
+				goto out;
 			}
 		}
+
 	}
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2373,10 +2392,10 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * handle that later.
 	 */
 	/* Only go through if we didn't race with anybody else... */
-	if (likely(pte_none(*page_table))) {
+	if (likely(pte_same(*page_table, orig_pte))) {
 		flush_icache_page(vma, page);
 		entry = mk_pte(page, vma->vm_page_prot);
-		if (write_access)
+		if (flags & FAULT_FLAG_WRITE)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
@@ -2386,7 +2405,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	} else {
 		inc_mm_counter(mm, file_rss);
 		page_add_file_rmap(page);
-		if (write_access) {
+		if (flags & FAULT_FLAG_WRITE) {
 			dirty_page = page;
 			get_page(dirty_page);
 		}
@@ -2399,25 +2418,42 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (anon)
 			page_cache_release(page);
 		else
-			anon = 1; /* not anon, but release nopage_page */
+			anon = 1; /* no anon but release faulted_page */
 	}
 
 	pte_unmap_unlock(page_table, ptl);
 
 out:
-	unlock_page(nopage_page);
+	unlock_page(faulted_page);
 	if (anon)
-		page_cache_release(nopage_page);
+		page_cache_release(faulted_page);
 	else if (dirty_page) {
 		set_page_dirty_balance(dirty_page);
 		put_page(dirty_page);
 	}
 
-	return ret;
+	return fdata.type;
+}
 
-out_error:
-	anon = 1; /* relase nopage_page */
-	goto out;
+static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access, pte_t orig_pte)
+{
+	pgoff_t pgoff = (((address & PAGE_MASK)
+			- vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
+	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
+
+	return __do_fault(mm, vma, address, page_table, pmd, pgoff, flags, orig_pte);
+}
+
+static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access, pgoff_t pgoff, pte_t orig_pte)
+{
+	unsigned int flags = FAULT_FLAG_NONLINEAR |
+				(write_access ? FAULT_FLAG_WRITE : 0);
+
+	return __do_fault(mm, vma, address, page_table, pmd, pgoff, flags, orig_pte);
 }
 
 /*
@@ -2496,9 +2532,14 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		print_bad_pte(vma, orig_pte, address);
 		return VM_FAULT_OOM;
 	}
-	/* We can then assume vm->vm_ops && vma->vm_ops->populate */
 
 	pgoff = pte_to_pgoff(orig_pte);
+
+	if (vma->vm_ops && vma->vm_ops->fault)
+		return do_nonlinear_fault(mm, vma, address, page_table, pmd,
+					write_access, pgoff, orig_pte);
+
+	/* We can then assume vm->vm_ops && vma->vm_ops->populate */
 	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
 					vma->vm_page_prot, pgoff, 0);
 	if (err == -ENOMEM)
@@ -2532,10 +2573,9 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (vma->vm_ops) {
-				if (vma->vm_ops->nopage)
-					return do_no_page(mm, vma, address,
-							pte, pmd,
-							write_access);
+				if (vma->vm_ops->fault || vma->vm_ops->nopage)
+					return do_linear_fault(mm, vma, address,
+						pte, pmd, write_access, entry);
 				if (unlikely(vma->vm_ops->nopfn))
 					return do_no_pfn(mm, vma, address, pte,
 						pmd, write_access);
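
For drivers and filesystems, switching over amounts to providing a .fault
method in place of .nopage (and dropping .populate); a hypothetical wiring,
with example_fault as a placeholder name, could look like:

	/* Hypothetical vm_operations_struct after conversion: the old
	 * .nopage/.populate members are replaced by a single .fault. */
	static struct vm_operations_struct example_vm_ops = {
		.fault	= example_fault,	/* was: .nopage = example_nopage */
	};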