aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBoaz Harrosh <boaz@plexistor.com>2015-04-15 19:15:11 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2015-04-15 19:35:20 -0400
commitdd9061846a3ba01b0fa45423aaa087e4a69187fa (patch)
tree799e6c9845a389a95173ca927caa825406175f41
parent2682582a6ea118d974c33da64923ae8c687fdd0b (diff)
mm: new pfn_mkwrite same as page_mkwrite for VM_PFNMAP
This will allow FS that uses VM_PFNMAP | VM_MIXEDMAP (no page structs) to get notified when access is a write to a read-only PFN. This can happen if we mmap() a file then first mmap-read from it to page-in a read-only PFN, then we mmap-write to the same page. We need this functionality to fix a DAX bug, where in the scenario above we fail to set ctime/mtime though we modified the file. An xfstest is attached to this patchset that shows the failure and the fix. (A DAX patch will follow) This functionality is extra important for us, because upon dirtying of a pmem page we also want to RDMA the page to a remote cluster node. We define a new pfn_mkwrite and do not reuse page_mkwrite because 1 - The name ;-) 2 - But mainly because it would take a very long and tedious audit of all page_mkwrite functions of VM_MIXEDMAP/VM_PFNMAP users, to make sure they do not now CRASH. For example current DAX code (which this is for) would crash. If we would want to reuse page_mkwrite, we will need to first patch all users, so to not-crash-on-no-page. Then enable this patch. But even if I did that I would not sleep so well at night. Adding a new vector is the safest thing to do, and is not that expensive: an extra pointer in a static function vector per driver. Also the new vector is better for performance, because otherwise we will call all current kernel vectors, so to: check-ha-no-page-do-nothing and return. No need to call it from do_shared_fault because do_wp_page is called to change pte permissions anyway. Signed-off-by: Yigal Korman <yigal@plexistor.com> Signed-off-by: Boaz Harrosh <boaz@plexistor.com> Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Matthew Wilcox <matthew.r.wilcox@intel.com> Cc: Jan Kara <jack@suse.cz> Cc: Hugh Dickins <hughd@google.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Dave Chinner <david@fromorbit.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--Documentation/filesystems/Locking8
-rw-r--r--include/linux/mm.h3
-rw-r--r--mm/memory.c43
3 files changed, 50 insertions, 4 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index f91926f2f482..8bb8a7ee0f99 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -525,6 +525,7 @@ prototypes:
525 void (*close)(struct vm_area_struct*); 525 void (*close)(struct vm_area_struct*);
526 int (*fault)(struct vm_area_struct*, struct vm_fault *); 526 int (*fault)(struct vm_area_struct*, struct vm_fault *);
527 int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); 527 int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *);
528 int (*pfn_mkwrite)(struct vm_area_struct *, struct vm_fault *);
528 int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); 529 int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
529 530
530locking rules: 531locking rules:
@@ -534,6 +535,7 @@ close: yes
534fault: yes can return with page locked 535fault: yes can return with page locked
535map_pages: yes 536map_pages: yes
536page_mkwrite: yes can return with page locked 537page_mkwrite: yes can return with page locked
538pfn_mkwrite: yes
537access: yes 539access: yes
538 540
539 ->fault() is called when a previously not present pte is about 541 ->fault() is called when a previously not present pte is about
@@ -560,6 +562,12 @@ the page has been truncated, the filesystem should not look up a new page
560like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which 562like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
561will cause the VM to retry the fault. 563will cause the VM to retry the fault.
562 564
565 ->pfn_mkwrite() is the same as page_mkwrite but when the pte is
566VM_PFNMAP or VM_MIXEDMAP with a page-less entry. Expected return is
567VM_FAULT_NOPAGE. Or one of the VM_FAULT_ERROR types. The default behavior
568after this call is to make the pte read-write, unless pfn_mkwrite returns
569an error.
570
563 ->access() is called when get_user_pages() fails in 571 ->access() is called when get_user_pages() fails in
564access_process_vm(), typically used to debug a process through 572access_process_vm(), typically used to debug a process through
565/proc/pid/mem or ptrace. This function is needed only for 573/proc/pid/mem or ptrace. This function is needed only for
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0e7bb2194da5..8b086070c3a5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -251,6 +251,9 @@ struct vm_operations_struct {
251 * writable, if an error is returned it will cause a SIGBUS */ 251 * writable, if an error is returned it will cause a SIGBUS */
252 int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); 252 int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);
253 253
254 /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
255 int (*pfn_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);
256
254 /* called by access_process_vm when get_user_pages() fails, typically 257 /* called by access_process_vm when get_user_pages() fails, typically
255 * for use by special VMAs that can switch between memory and hardware 258 * for use by special VMAs that can switch between memory and hardware
256 */ 259 */
diff --git a/mm/memory.c b/mm/memory.c
index f9628e568c58..22e037e3364e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2180,6 +2180,42 @@ oom:
2180 return VM_FAULT_OOM; 2180 return VM_FAULT_OOM;
2181} 2181}
2182 2182
2183/*
2184 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
2185 * mapping
 *
 * There is no struct page behind the pte here (vmf.page is NULL below), so
 * nothing can be dirtied/accounted; instead, if the driver provides a
 * ->pfn_mkwrite hook it is notified of the write before the pte is made
 * writable, and it may refuse by returning a VM_FAULT_ERROR code.
 * Called from do_wp_page() with the pte mapped and ptl held; returns with
 * the pte unmapped/unlocked (either via wp_page_reuse() or the unlock
 * paths below).
2186 */
2187static int wp_pfn_shared(struct mm_struct *mm,
2188			struct vm_area_struct *vma, unsigned long address,
2189			pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
2190			pmd_t *pmd)
2191{
2192	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2193		struct vm_fault vmf = {
			/* No struct page for a raw-pfn mapping. */
2194			.page = NULL,
2195			.pgoff = linear_page_index(vma, address),
2196			.virtual_address = (void __user *)(address & PAGE_MASK),
2197			.flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
2198		};
2199		int ret;
2200
		/* Call the driver callback with the pte lock dropped. */
2201		pte_unmap_unlock(page_table, ptl);
2202		ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
2203		if (ret & VM_FAULT_ERROR)
2204			return ret;
2205		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2206		/*
2207		 * We might have raced with another page fault while we
2208		 * released the pte_offset_map_lock.
2209		 */
2210		if (!pte_same(*page_table, orig_pte)) {
			/* Someone else already serviced this fault; done. */
2211			pte_unmap_unlock(page_table, ptl);
2212			return 0;
2213		}
2214	}
	/* No hook, or it succeeded: make the existing pte writable. */
2215	return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
2216			     NULL, 0, 0);
2217}
2218
2183static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, 2219static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
2184 unsigned long address, pte_t *page_table, 2220 unsigned long address, pte_t *page_table,
2185 pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, 2221 pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
@@ -2258,13 +2294,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2258 * VM_PFNMAP VMA. 2294 * VM_PFNMAP VMA.
2259 * 2295 *
2260 * We should not cow pages in a shared writeable mapping. 2296 * We should not cow pages in a shared writeable mapping.
2261 * Just mark the pages writable as we can't do any dirty 2297 * Just mark the pages writable and/or call ops->pfn_mkwrite.
2262 * accounting on raw pfn maps.
2263 */ 2298 */
2264 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2299 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2265 (VM_WRITE|VM_SHARED)) 2300 (VM_WRITE|VM_SHARED))
2266 return wp_page_reuse(mm, vma, address, page_table, ptl, 2301 return wp_pfn_shared(mm, vma, address, page_table, ptl,
2267 orig_pte, old_page, 0, 0); 2302 orig_pte, pmd);
2268 2303
2269 pte_unmap_unlock(page_table, ptl); 2304 pte_unmap_unlock(page_table, ptl);
2270 return wp_page_copy(mm, vma, address, page_table, pmd, 2305 return wp_page_copy(mm, vma, address, page_table, pmd,