diff options
author | Boaz Harrosh <boaz@plexistor.com> | 2015-04-15 19:15:11 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-15 19:35:20 -0400 |
commit | dd9061846a3ba01b0fa45423aaa087e4a69187fa (patch) | |
tree | 799e6c9845a389a95173ca927caa825406175f41 | |
parent | 2682582a6ea118d974c33da64923ae8c687fdd0b (diff) |
mm: new pfn_mkwrite same as page_mkwrite for VM_PFNMAP
This will allow an FS that uses VM_PFNMAP | VM_MIXEDMAP (no page structs) to
get notified when access is a write to a read-only PFN.
This can happen if we mmap() a file then first mmap-read from it to
page-in a read-only PFN, then we mmap-write to the same page.
We need this functionality to fix a DAX bug, where in the scenario above
we fail to set ctime/mtime though we modified the file. An xfstest is
attached to this patchset that shows the failure and the fix. (A DAX
patch will follow)
This functionality is extra important for us, because upon dirtying of a
pmem page we also want to RDMA the page to a remote cluster node.
We define a new pfn_mkwrite and do not reuse page_mkwrite because
1 - The name ;-)
2 - But mainly because it would take a very long and tedious
audit of all page_mkwrite functions of VM_MIXEDMAP/VM_PFNMAP
users, to make sure they would not crash. For example, current
DAX code (which this is for) would crash.
If we would want to reuse page_mkwrite, We will need to first
patch all users, so to not-crash-on-no-page. Then enable this
patch. But even if I did that I would not sleep so well at night.
Adding a new vector is the safest thing to do, and is not that
expensive: just an extra pointer in a static function vector per driver.
Also, the new vector is better for performance, because otherwise we
would call all the current kernel vectors, only to check, find no page,
do nothing, and return.
No need to call it from do_shared_fault because do_wp_page is called to
change pte permissions anyway.
Signed-off-by: Yigal Korman <yigal@plexistor.com>
Signed-off-by: Boaz Harrosh <boaz@plexistor.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | Documentation/filesystems/Locking | 8 | ||||
-rw-r--r-- | include/linux/mm.h | 3 | ||||
-rw-r--r-- | mm/memory.c | 43 |
3 files changed, 50 insertions, 4 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index f91926f2f482..8bb8a7ee0f99 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -525,6 +525,7 @@ prototypes: | |||
525 | void (*close)(struct vm_area_struct*); | 525 | void (*close)(struct vm_area_struct*); |
526 | int (*fault)(struct vm_area_struct*, struct vm_fault *); | 526 | int (*fault)(struct vm_area_struct*, struct vm_fault *); |
527 | int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); | 527 | int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); |
528 | int (*pfn_mkwrite)(struct vm_area_struct *, struct vm_fault *); | ||
528 | int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); | 529 | int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); |
529 | 530 | ||
530 | locking rules: | 531 | locking rules: |
@@ -534,6 +535,7 @@ close: yes | |||
534 | fault: yes can return with page locked | 535 | fault: yes can return with page locked |
535 | map_pages: yes | 536 | map_pages: yes |
536 | page_mkwrite: yes can return with page locked | 537 | page_mkwrite: yes can return with page locked |
538 | pfn_mkwrite: yes | ||
537 | access: yes | 539 | access: yes |
538 | 540 | ||
539 | ->fault() is called when a previously not present pte is about | 541 | ->fault() is called when a previously not present pte is about |
@@ -560,6 +562,12 @@ the page has been truncated, the filesystem should not look up a new page | |||
560 | like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which | 562 | like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which |
561 | will cause the VM to retry the fault. | 563 | will cause the VM to retry the fault. |
562 | 564 | ||
565 | ->pfn_mkwrite() is the same as page_mkwrite but when the pte is | ||
566 | VM_PFNMAP or VM_MIXEDMAP with a page-less entry. Expected return is | ||
567 | VM_FAULT_NOPAGE. Or one of the VM_FAULT_ERROR types. The default behavior | ||
568 | after this call is to make the pte read-write, unless pfn_mkwrite returns | ||
569 | an error. | ||
570 | |||
563 | ->access() is called when get_user_pages() fails in | 571 | ->access() is called when get_user_pages() fails in |
564 | access_process_vm(), typically used to debug a process through | 572 | access_process_vm(), typically used to debug a process through |
565 | /proc/pid/mem or ptrace. This function is needed only for | 573 | /proc/pid/mem or ptrace. This function is needed only for |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e7bb2194da5..8b086070c3a5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -251,6 +251,9 @@ struct vm_operations_struct { | |||
251 | * writable, if an error is returned it will cause a SIGBUS */ | 251 | * writable, if an error is returned it will cause a SIGBUS */ |
252 | int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); | 252 | int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); |
253 | 253 | ||
254 | /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ | ||
255 | int (*pfn_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); | ||
256 | |||
254 | /* called by access_process_vm when get_user_pages() fails, typically | 257 | /* called by access_process_vm when get_user_pages() fails, typically |
255 | * for use by special VMAs that can switch between memory and hardware | 258 | * for use by special VMAs that can switch between memory and hardware |
256 | */ | 259 | */ |
diff --git a/mm/memory.c b/mm/memory.c index f9628e568c58..22e037e3364e 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -2180,6 +2180,42 @@ oom: | |||
2180 | return VM_FAULT_OOM; | 2180 | return VM_FAULT_OOM; |
2181 | } | 2181 | } |
2182 | 2182 | ||
2183 | /* | ||
2184 | * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED | ||
2185 | * mapping | ||
2186 | */ | ||
2187 | static int wp_pfn_shared(struct mm_struct *mm, | ||
2188 | struct vm_area_struct *vma, unsigned long address, | ||
2189 | pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, | ||
2190 | pmd_t *pmd) | ||
2191 | { | ||
2192 | if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { | ||
2193 | struct vm_fault vmf = { | ||
2194 | .page = NULL, | ||
2195 | .pgoff = linear_page_index(vma, address), | ||
2196 | .virtual_address = (void __user *)(address & PAGE_MASK), | ||
2197 | .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, | ||
2198 | }; | ||
2199 | int ret; | ||
2200 | |||
2201 | pte_unmap_unlock(page_table, ptl); | ||
2202 | ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); | ||
2203 | if (ret & VM_FAULT_ERROR) | ||
2204 | return ret; | ||
2205 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
2206 | /* | ||
2207 | * We might have raced with another page fault while we | ||
2208 | * released the pte_offset_map_lock. | ||
2209 | */ | ||
2210 | if (!pte_same(*page_table, orig_pte)) { | ||
2211 | pte_unmap_unlock(page_table, ptl); | ||
2212 | return 0; | ||
2213 | } | ||
2214 | } | ||
2215 | return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, | ||
2216 | NULL, 0, 0); | ||
2217 | } | ||
2218 | |||
2183 | static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | 2219 | static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, |
2184 | unsigned long address, pte_t *page_table, | 2220 | unsigned long address, pte_t *page_table, |
2185 | pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, | 2221 | pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, |
@@ -2258,13 +2294,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2258 | * VM_PFNMAP VMA. | 2294 | * VM_PFNMAP VMA. |
2259 | * | 2295 | * |
2260 | * We should not cow pages in a shared writeable mapping. | 2296 | * We should not cow pages in a shared writeable mapping. |
2261 | * Just mark the pages writable as we can't do any dirty | 2297 | * Just mark the pages writable and/or call ops->pfn_mkwrite. |
2262 | * accounting on raw pfn maps. | ||
2263 | */ | 2298 | */ |
2264 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2299 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2265 | (VM_WRITE|VM_SHARED)) | 2300 | (VM_WRITE|VM_SHARED)) |
2266 | return wp_page_reuse(mm, vma, address, page_table, ptl, | 2301 | return wp_pfn_shared(mm, vma, address, page_table, ptl, |
2267 | orig_pte, old_page, 0, 0); | 2302 | orig_pte, pmd); |
2268 | 2303 | ||
2269 | pte_unmap_unlock(page_table, ptl); | 2304 | pte_unmap_unlock(page_table, ptl); |
2270 | return wp_page_copy(mm, vma, address, page_table, pmd, | 2305 | return wp_page_copy(mm, vma, address, page_table, pmd, |