-rw-r--r--  Documentation/filesystems/Locking |  24
-rw-r--r--  fs/buffer.c                       |  10
-rw-r--r--  mm/memory.c                       | 108
3 files changed, 98 insertions, 44 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 76efe5b71d7d..3120f8dd2c31 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -512,16 +512,24 @@ locking rules:
 		BKL	mmap_sem	PageLocked(page)
 open:		no	yes
 close:		no	yes
-fault:		no	yes
-page_mkwrite:	no	yes	no
+fault:		no	yes		can return with page locked
+page_mkwrite:	no	yes		can return with page locked
 access:		no	yes
 
-	->page_mkwrite() is called when a previously read-only page is
-about to become writeable. The file system is responsible for
-protecting against truncate races. Once appropriate action has been
-taking to lock out truncate, the page range should be verified to be
-within i_size. The page mapping should also be checked that it is not
-NULL.
+	->fault() is called when a previously not present pte is about
+to be faulted in. The filesystem must find and return the page associated
+with the passed in "pgoff" in the vm_fault structure. If it is possible that
+the page may be truncated and/or invalidated, then the filesystem must lock
+the page, then ensure it is not already truncated (the page lock will block
+subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
+locked. The VM will unlock the page.
+
+	->page_mkwrite() is called when a previously read-only pte is
+about to become writeable. The filesystem again must ensure that there are
+no truncate/invalidate races, and then return with the page locked. If
+the page has been truncated, the filesystem should not look up a new page
+like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
+will cause the VM to retry the fault.
 
 	->access() is called when get_user_pages() fails in
 acces_process_vm(), typically used to debug a process through
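
For illustration only, not taken from the patch: a minimal sketch of a filesystem ->page_mkwrite() handler that follows the rule described above. The filesystem and its block-preparation helper (examplefs_prepare_write) are hypothetical; the lock-page / recheck-truncate / return-VM_FAULT_LOCKED pattern mirrors the updated text and the block_page_mkwrite() change below.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical block/space reservation done by the filesystem. */
extern int examplefs_prepare_write(struct inode *inode, struct page *page);

static int examplefs_page_mkwrite(struct vm_area_struct *vma,
				  struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;

	lock_page(page);
	/* The page lock blocks truncate; recheck that we did not race with one. */
	if (page->mapping != inode->i_mapping ||
	    page_offset(page) >= i_size_read(inode)) {
		unlock_page(page);
		/* Truncated: do not look up a new page, let the VM retry the fault. */
		return VM_FAULT_NOPAGE;
	}

	if (examplefs_prepare_write(inode, page)) {
		unlock_page(page);
		return VM_FAULT_SIGBUS;
	}

	/* Success: the page stays locked, the VM will unlock it. */
	return VM_FAULT_LOCKED;
}
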
diff --git a/fs/buffer.c b/fs/buffer.c
index b3e5be7514f5..aed297739eb0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2397,7 +2397,8 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_offset(page) > size)) {
 		/* page got truncated out from underneath us */
-		goto out_unlock;
+		unlock_page(page);
+		goto out;
 	}
 
 	/* page is wholly or partially inside EOF */
@@ -2411,14 +2412,15 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	ret = block_commit_write(page, 0, end);
 
 	if (unlikely(ret)) {
+		unlock_page(page);
 		if (ret == -ENOMEM)
 			ret = VM_FAULT_OOM;
 		else /* -ENOSPC, -EIO, etc */
 			ret = VM_FAULT_SIGBUS;
-	}
+	} else
+		ret = VM_FAULT_LOCKED;
 
-out_unlock:
-	unlock_page(page);
+out:
 	return ret;
 }
 
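
A usage sketch, again not part of the patch: a filesystem built on the buffer layer would typically hook block_page_mkwrite() up as its ->page_mkwrite handler; after this change it returns VM_FAULT_LOCKED with the page still locked on success. The examplefs names and the get_block callback are hypothetical; filemap_fault() is the stock ->fault implementation.

#include <linux/buffer_head.h>
#include <linux/mm.h>

/* Hypothetical get_block callback of the example filesystem. */
extern int examplefs_get_block(struct inode *inode, sector_t iblock,
			       struct buffer_head *bh_result, int create);

static int examplefs_block_page_mkwrite(struct vm_area_struct *vma,
					struct vm_fault *vmf)
{
	/*
	 * On success block_page_mkwrite() now returns VM_FAULT_LOCKED and
	 * leaves the page locked; the VM unlocks it once the pte is set up.
	 */
	return block_page_mkwrite(vma, vmf, examplefs_get_block);
}

static struct vm_operations_struct examplefs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= examplefs_block_page_mkwrite,
};
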
diff --git a/mm/memory.c b/mm/memory.c
index 6a4ef0fd0711..4126dd16778c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1971,6 +1971,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				ret = tmp;
 				goto unwritable_page;
 			}
+			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+				lock_page(old_page);
+				if (!old_page->mapping) {
+					ret = 0; /* retry the fault */
+					unlock_page(old_page);
+					goto unwritable_page;
+				}
+			} else
+				VM_BUG_ON(!PageLocked(old_page));
 
 			/*
 			 * Since we dropped the lock we need to revalidate
@@ -1980,9 +1989,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 */
 			page_table = pte_offset_map_lock(mm, pmd, address,
 							 &ptl);
-			page_cache_release(old_page);
-			if (!pte_same(*page_table, orig_pte))
+			if (!pte_same(*page_table, orig_pte)) {
+				unlock_page(old_page);
+				page_cache_release(old_page);
 				goto unlock;
+			}
 
 			page_mkwrite = 1;
 		}
@@ -2094,9 +2105,6 @@ gotten:
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 	if (dirty_page) {
-		if (vma->vm_file)
-			file_update_time(vma->vm_file);
-
 		/*
 		 * Yes, Virginia, this is actually required to prevent a race
 		 * with clear_page_dirty_for_io() from clearing the page dirty
@@ -2105,16 +2113,41 @@ unlock:
 		 *
 		 * do_no_page is protected similarly.
 		 */
-		wait_on_page_locked(dirty_page);
-		set_page_dirty_balance(dirty_page, page_mkwrite);
+		if (!page_mkwrite) {
+			wait_on_page_locked(dirty_page);
+			set_page_dirty_balance(dirty_page, page_mkwrite);
+		}
 		put_page(dirty_page);
+		if (page_mkwrite) {
+			struct address_space *mapping = dirty_page->mapping;
+
+			set_page_dirty(dirty_page);
+			unlock_page(dirty_page);
+			page_cache_release(dirty_page);
+			if (mapping) {
+				/*
+				 * Some device drivers do not set page.mapping
+				 * but still dirty their pages
+				 */
+				balance_dirty_pages_ratelimited(mapping);
+			}
+		}
+
+		/* file_update_time outside page_lock */
+		if (vma->vm_file)
+			file_update_time(vma->vm_file);
 	}
 	return ret;
 oom_free_new:
 	page_cache_release(new_page);
 oom:
-	if (old_page)
+	if (old_page) {
+		if (page_mkwrite) {
+			unlock_page(old_page);
+			page_cache_release(old_page);
+		}
 		page_cache_release(old_page);
+	}
 	return VM_FAULT_OOM;
 
 unwritable_page:
@@ -2664,27 +2697,22 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			int tmp;
 
 			unlock_page(page);
-			vmf.flags |= FAULT_FLAG_MKWRITE;
+			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
 			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
 			if (unlikely(tmp &
 				  (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
 				ret = tmp;
-				anon = 1; /* no anon but release vmf.page */
-				goto out_unlocked;
-			}
-			lock_page(page);
-			/*
-			 * XXX: this is not quite right (racy vs
-			 * invalidate) to unlock and relock the page
-			 * like this, however a better fix requires
-			 * reworking page_mkwrite locking API, which
-			 * is better done later.
-			 */
-			if (!page->mapping) {
-				ret = 0;
-				anon = 1; /* no anon but release vmf.page */
-				goto out;
+				goto unwritable_page;
 			}
+			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+				lock_page(page);
+				if (!page->mapping) {
+					ret = 0; /* retry the fault */
+					unlock_page(page);
+					goto unwritable_page;
+				}
+			} else
+				VM_BUG_ON(!PageLocked(page));
 			page_mkwrite = 1;
 		}
 	}
@@ -2736,19 +2764,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_unmap_unlock(page_table, ptl);
 
 out:
-	unlock_page(vmf.page);
-out_unlocked:
-	if (anon)
-		page_cache_release(vmf.page);
-	else if (dirty_page) {
-		if (vma->vm_file)
-			file_update_time(vma->vm_file);
+	if (dirty_page) {
+		struct address_space *mapping = page->mapping;
 
-		set_page_dirty_balance(dirty_page, page_mkwrite);
+		if (set_page_dirty(dirty_page))
+			page_mkwrite = 1;
+		unlock_page(dirty_page);
 		put_page(dirty_page);
+		if (page_mkwrite && mapping) {
+			/*
+			 * Some device drivers do not set page.mapping but still
+			 * dirty their pages
+			 */
+			balance_dirty_pages_ratelimited(mapping);
+		}
+
+		/* file_update_time outside page_lock */
+		if (vma->vm_file)
+			file_update_time(vma->vm_file);
+	} else {
+		unlock_page(vmf.page);
+		if (anon)
+			page_cache_release(vmf.page);
 	}
 
 	return ret;
+
+unwritable_page:
+	page_cache_release(page);
+	return ret;
 }
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
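
Finally, to round out the Documentation text on ->fault(): a minimal, hypothetical sketch of a fault handler that looks the page up, locks it, rechecks truncation under the lock, and returns VM_FAULT_LOCKED so that __do_fault() above can skip its own lock_page()/mapping recheck. Real filesystems normally get this behaviour from filemap_fault(); the examplefs name and the omitted read-in step are assumptions, not anything defined by this patch.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static int examplefs_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;
	pgoff_t last_index;
	struct page *page;

	/* find_or_create_page() returns the page locked and referenced. */
	page = find_or_create_page(mapping, vmf->pgoff, GFP_KERNEL);
	if (!page)
		return VM_FAULT_OOM;

	/* (Reading the page contents in from disk is omitted in this sketch.) */

	/*
	 * The page lock blocks a concurrent truncate, so recheck i_size and
	 * page->mapping under it before handing the page back to the VM.
	 */
	last_index = (i_size_read(inode) + PAGE_CACHE_SIZE - 1)
						>> PAGE_CACHE_SHIFT;
	if (!page->mapping || vmf->pgoff >= last_index) {
		unlock_page(page);
		page_cache_release(page);
		return VM_FAULT_SIGBUS;
	}

	vmf->page = page;
	return VM_FAULT_LOCKED;	/* the VM unlocks the page after mapping it */
}
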