author		David Howells <dhowells@redhat.com>	2006-06-23 05:03:43 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-06-23 10:42:51 -0400
commit		9637a5efd4fbe36164c5ce7f6a0ee68b2bf22b7f
tree		38b86e3e2151e78f952076e36bee4fd7d77e3baf /mm/memory.c
parent		bd96b9eb7cfd6ab24ba244360a09980a720874d2
[PATCH] add page_mkwrite() vm_operations method
Add a new VMA operation to notify a filesystem or other driver about the
MMU generating a fault because userspace attempted to write to a page
mapped through a read-only PTE.

This facility permits the filesystem or driver to:

 (*) Implement storage allocation/reservation on attempted write, and so to
     deal with problems such as ENOSPC more gracefully (perhaps by generating
     SIGBUS).

 (*) Delay making the page writable until the contents have been written to a
     backing cache. This is useful for NFS/AFS when using FS-Cache/CacheFS. It
     permits the filesystem to have some guarantee about the state of the
     cache.

 (*) Account and limit the number of dirty pages. This is one piece of the
     puzzle needed to make shared writable mappings work safely in FUSE.

Needed by cachefs (or is it cachefiles? Or fscache? <head spins>).

At least four other groups have stated an interest in it or a desire to use
the functionality it provides: FUSE, OCFS2, NTFS and JFFS2. Also, things like
EXT3 really ought to use it to deal with the case of a shared-writable mmap
encountering ENOSPC before we permit the page to be dirtied.

From: Peter Zijlstra <a.p.zijlstra@chello.nl>

 get_user_pages(.write=1, .force=1) can generate COW hits on read-only shared
 mappings; this patch traps those as page_mkwrite() candidates and no longer
 handles them the old way.

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Joel Becker <Joel.Becker@oracle.com>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
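For concreteness, a filesystem or driver could wire the new hook up roughly
as follows. This is only a sketch against the vm_operations_struct of this
kernel era: the myfs_* names are hypothetical, and myfs_reserve_block()
stands in for whatever reservation work the filesystem actually needs; only
the ->page_mkwrite() signature and the registration in ->mmap() follow the
API added by this patch.

	/*
	 * Hypothetical example: reserve backing store the first time
	 * userspace dirties a page of a shared, writable mapping.
	 */
	#include <linux/fs.h>
	#include <linux/mm.h>

	/* Hypothetical helper: reserve a block backing the given page. */
	static int myfs_reserve_block(struct inode *inode, pgoff_t index);

	static int myfs_page_mkwrite(struct vm_area_struct *vma,
				     struct page *page)
	{
		struct inode *inode = vma->vm_file->f_dentry->d_inode;

		/*
		 * A negative return makes the fault handler deliver SIGBUS
		 * instead of making the PTE writable.
		 */
		if (myfs_reserve_block(inode, page->index) < 0)
			return -ENOSPC;
		return 0;
	}

	static struct vm_operations_struct myfs_file_vm_ops = {
		.nopage		= filemap_nopage,    /* standard read-fault path */
		.page_mkwrite	= myfs_page_mkwrite, /* the new write-notify hook */
	};

	static int myfs_file_mmap(struct file *file, struct vm_area_struct *vma)
	{
		vma->vm_ops = &myfs_file_vm_ops;
		return 0;
	}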
Diffstat (limited to 'mm/memory.c')
 mm/memory.c | 100 ++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 76 insertions(+), 24 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 11673c5d2c20..247b5c312b9b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1457,25 +1457,60 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
-	int ret = VM_FAULT_MINOR;
+	int reuse, ret = VM_FAULT_MINOR;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
 	if (!old_page)
 		goto gotten;
 
-	if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
-		int reuse = can_share_swap_page(old_page);
-		unlock_page(old_page);
-		if (reuse) {
-			flush_cache_page(vma, address, pte_pfn(orig_pte));
-			entry = pte_mkyoung(orig_pte);
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-			ptep_set_access_flags(vma, address, page_table, entry, 1);
-			update_mmu_cache(vma, address, entry);
-			lazy_mmu_prot_update(entry);
-			ret |= VM_FAULT_WRITE;
-			goto unlock;
-		}
+	if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
+					(VM_SHARED|VM_WRITE))) {
+		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+			/*
+			 * Notify the address space that the page is about to
+			 * become writable so that it can prohibit this or wait
+			 * for the page to get into an appropriate state.
+			 *
+			 * We do this without the lock held, so that it can
+			 * sleep if it needs to.
+			 */
+			page_cache_get(old_page);
+			pte_unmap_unlock(page_table, ptl);
+
+			if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
+				goto unwritable_page;
+
+			page_cache_release(old_page);
+
+			/*
+			 * Since we dropped the lock we need to revalidate
+			 * the PTE as someone else may have changed it.  If
+			 * they did, we just return, as we can count on the
+			 * MMU to tell us if they didn't also make it writable.
+			 */
+			page_table = pte_offset_map_lock(mm, pmd, address,
+							 &ptl);
+			if (!pte_same(*page_table, orig_pte))
+				goto unlock;
+		}
+
+		reuse = 1;
+	} else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
+		reuse = can_share_swap_page(old_page);
+		unlock_page(old_page);
+	} else {
+		reuse = 0;
+	}
+
+	if (reuse) {
+		flush_cache_page(vma, address, pte_pfn(orig_pte));
+		entry = pte_mkyoung(orig_pte);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		ptep_set_access_flags(vma, address, page_table, entry, 1);
+		update_mmu_cache(vma, address, entry);
+		lazy_mmu_prot_update(entry);
+		ret |= VM_FAULT_WRITE;
+		goto unlock;
 	}
 
 	/*
@@ -1535,6 +1570,10 @@ oom:
 	if (old_page)
 		page_cache_release(old_page);
 	return VM_FAULT_OOM;
+
+unwritable_page:
+	page_cache_release(old_page);
+	return VM_FAULT_SIGBUS;
 }
 
 /*
@@ -2083,18 +2122,31 @@ retry:
 	/*
 	 * Should we do an early C-O-W break?
 	 */
-	if (write_access && !(vma->vm_flags & VM_SHARED)) {
-		struct page *page;
+	if (write_access) {
+		if (!(vma->vm_flags & VM_SHARED)) {
+			struct page *page;
 
-		if (unlikely(anon_vma_prepare(vma)))
-			goto oom;
-		page = alloc_page_vma(GFP_HIGHUSER, vma, address);
-		if (!page)
-			goto oom;
-		copy_user_highpage(page, new_page, address);
-		page_cache_release(new_page);
-		new_page = page;
-		anon = 1;
+			if (unlikely(anon_vma_prepare(vma)))
+				goto oom;
+			page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+			if (!page)
+				goto oom;
+			copy_user_highpage(page, new_page, address);
+			page_cache_release(new_page);
+			new_page = page;
+			anon = 1;
+
+		} else {
+			/* if the page will be shareable, see if the backing
+			 * address space wants to know that the page is about
+			 * to become writable */
+			if (vma->vm_ops->page_mkwrite &&
+			    vma->vm_ops->page_mkwrite(vma, new_page) < 0
+			    ) {
+				page_cache_release(new_page);
+				return VM_FAULT_SIGBUS;
+			}
+		}
 	}
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
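From userspace, the first store into a page of a MAP_SHARED, PROT_WRITE file
mapping is what drives the fault paths patched above (do_no_page() here, or
do_wp_page() if the page was already mapped read-only). A minimal
demonstration, assuming a pre-existing "testfile" at least one page long:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("testfile", O_RDWR);	/* assumed >= 1 page */
		char *p;

		if (fd < 0) {
			perror("open");
			return 1;
		}
		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		/*
		 * This store write-faults; with the patch applied, the kernel
		 * calls ->page_mkwrite() first and delivers SIGBUS if the
		 * hook refuses to make the page writable.
		 */
		p[0] = 'x';

		munmap(p, 4096);
		close(fd);
		return 0;
	}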