author    David Howells <dhowells@redhat.com>    2006-06-23 05:03:43 -0400
committer Linus Torvalds <torvalds@g5.osdl.org>  2006-06-23 10:42:51 -0400
commit    9637a5efd4fbe36164c5ce7f6a0ee68b2bf22b7f (patch)
tree      38b86e3e2151e78f952076e36bee4fd7d77e3baf /mm
parent    bd96b9eb7cfd6ab24ba244360a09980a720874d2 (diff)
[PATCH] add page_mkwrite() vm_operations method
Add a new VMA operation to notify a filesystem or other driver about the MMU
generating a fault because userspace attempted to write to a page mapped
through a read-only PTE.

This facility permits the filesystem or driver to:

 (*) Implement storage allocation/reservation on attempted write, and so deal
     with problems such as ENOSPC more gracefully (perhaps by generating
     SIGBUS).

 (*) Delay making the page writable until the contents have been written to a
     backing cache. This is useful for NFS/AFS when using FS-Cache/CacheFS.
     It permits the filesystem to have some guarantee about the state of the
     cache.

 (*) Account for and limit the number of dirty pages. This is one piece of
     the puzzle needed to make shared writable mappings work safely in FUSE.

Needed by cachefs (Or is it cachefiles? Or fscache? <head spins>).

At least four other groups have stated an interest in it or a desire to use
the functionality it provides: FUSE, OCFS2, NTFS and JFFS2. Also, things like
EXT3 really ought to use it to deal with the case of a shared-writable mmap
encountering ENOSPC before we permit the page to be dirtied.

From: Peter Zijlstra <a.p.zijlstra@chello.nl>

  get_user_pages(.write=1, .force=1) can generate COW hits on read-only
  shared mappings; this patch traps those as page_mkwrite candidates and no
  longer handles them the old way.

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Joel Becker <Joel.Becker@oracle.com>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
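As a minimal sketch of the new hook's use (not part of this patch): a
filesystem could supply ->page_mkwrite() in its vm_operations_struct to
reserve backing store before a shared page is allowed to become writable.
"myfs" and myfs_reserve_blocks() below are hypothetical names; the callback
signature is the one do_wp_page()/do_no_page() call in this patch, and
filemap_nopage()/filemap_populate() are the existing generic helpers.

/* Hypothetical filesystem hook-up of ->page_mkwrite(); illustrative only. */
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/errno.h>

int myfs_reserve_blocks(struct inode *inode, struct page *page);	/* hypothetical */

static int myfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
	struct inode *inode = vma->vm_file->f_mapping->host;

	/* A negative return makes the fault fail with VM_FAULT_SIGBUS
	 * instead of letting the page be dirtied. */
	if (myfs_reserve_blocks(inode, page) < 0)
		return -ENOSPC;
	return 0;
}

static struct vm_operations_struct myfs_file_vm_ops = {
	.nopage		= filemap_nopage,
	.populate	= filemap_populate,
	.page_mkwrite	= myfs_page_mkwrite,
};

The filesystem's ->mmap() would then install myfs_file_vm_ops on the VMA,
much as generic_file_mmap() installs generic_file_vm_ops.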
Diffstat (limited to 'mm')
-rw-r--r--   mm/memory.c    100
-rw-r--r--   mm/mmap.c       12
-rw-r--r--   mm/mprotect.c   11
3 files changed, 95 insertions(+), 28 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 11673c5d2c20..247b5c312b9b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1457,25 +1457,60 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
-	int ret = VM_FAULT_MINOR;
+	int reuse, ret = VM_FAULT_MINOR;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
 	if (!old_page)
 		goto gotten;
 
-	if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
-		int reuse = can_share_swap_page(old_page);
-		unlock_page(old_page);
-		if (reuse) {
-			flush_cache_page(vma, address, pte_pfn(orig_pte));
-			entry = pte_mkyoung(orig_pte);
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-			ptep_set_access_flags(vma, address, page_table, entry, 1);
-			update_mmu_cache(vma, address, entry);
-			lazy_mmu_prot_update(entry);
-			ret |= VM_FAULT_WRITE;
-			goto unlock;
+	if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
+					(VM_SHARED|VM_WRITE))) {
+		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+			/*
+			 * Notify the address space that the page is about to
+			 * become writable so that it can prohibit this or wait
+			 * for the page to get into an appropriate state.
+			 *
+			 * We do this without the lock held, so that it can
+			 * sleep if it needs to.
+			 */
+			page_cache_get(old_page);
+			pte_unmap_unlock(page_table, ptl);
+
+			if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
+				goto unwritable_page;
+
+			page_cache_release(old_page);
+
+			/*
+			 * Since we dropped the lock we need to revalidate
+			 * the PTE as someone else may have changed it.  If
+			 * they did, we just return, as we can count on the
+			 * MMU to tell us if they didn't also make it writable.
+			 */
+			page_table = pte_offset_map_lock(mm, pmd, address,
+							 &ptl);
+			if (!pte_same(*page_table, orig_pte))
+				goto unlock;
 		}
+
+		reuse = 1;
+	} else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
+		reuse = can_share_swap_page(old_page);
+		unlock_page(old_page);
+	} else {
+		reuse = 0;
+	}
+
+	if (reuse) {
+		flush_cache_page(vma, address, pte_pfn(orig_pte));
+		entry = pte_mkyoung(orig_pte);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		ptep_set_access_flags(vma, address, page_table, entry, 1);
+		update_mmu_cache(vma, address, entry);
+		lazy_mmu_prot_update(entry);
+		ret |= VM_FAULT_WRITE;
+		goto unlock;
 	}
 
 	/*
@@ -1535,6 +1570,10 @@ oom:
 	if (old_page)
 		page_cache_release(old_page);
 	return VM_FAULT_OOM;
+
+unwritable_page:
+	page_cache_release(old_page);
+	return VM_FAULT_SIGBUS;
 }
 
 /*
@@ -2083,18 +2122,31 @@ retry:
 	/*
 	 * Should we do an early C-O-W break?
 	 */
-	if (write_access && !(vma->vm_flags & VM_SHARED)) {
-		struct page *page;
+	if (write_access) {
+		if (!(vma->vm_flags & VM_SHARED)) {
+			struct page *page;
 
-		if (unlikely(anon_vma_prepare(vma)))
-			goto oom;
-		page = alloc_page_vma(GFP_HIGHUSER, vma, address);
-		if (!page)
-			goto oom;
-		copy_user_highpage(page, new_page, address);
-		page_cache_release(new_page);
-		new_page = page;
-		anon = 1;
+			if (unlikely(anon_vma_prepare(vma)))
+				goto oom;
+			page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+			if (!page)
+				goto oom;
+			copy_user_highpage(page, new_page, address);
+			page_cache_release(new_page);
+			new_page = page;
+			anon = 1;
+
+		} else {
+			/* if the page will be shareable, see if the backing
+			 * address space wants to know that the page is about
+			 * to become writable */
+			if (vma->vm_ops->page_mkwrite &&
+			    vma->vm_ops->page_mkwrite(vma, new_page) < 0
+			    ) {
+				page_cache_release(new_page);
+				return VM_FAULT_SIGBUS;
+			}
+		}
 	}
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
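Rough userspace illustration of the path changed above (assuming the file
lives on a filesystem that implements ->page_mkwrite(); the path name is
purely illustrative): with this patch a shared writable mapping starts out
with write-protected PTEs, so the first store faults into do_wp_page() and
the callback runs before the page may be dirtied, or the process gets SIGBUS
(e.g. on ENOSPC).

/* Illustrative only: triggers a write fault on a MAP_SHARED file mapping.
 * Assumes 4K pages. */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/somefs/file", O_RDWR);	/* illustrative path */
	char *p;

	if (fd < 0)
		return 1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	p[0] = 'x';	/* write fault -> do_wp_page() -> ->page_mkwrite() */
	msync(p, 4096, MS_SYNC);
	munmap(p, 4096);
	close(fd);
	return 0;
}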
diff --git a/mm/mmap.c b/mm/mmap.c
index e6ee12344b13..6446c6134b04 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1065,7 +1065,8 @@ munmap_back:
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 	vma->vm_flags = vm_flags;
-	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
+	vma->vm_page_prot = protection_map[vm_flags &
+				(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
 	vma->vm_pgoff = pgoff;
 
 	if (file) {
@@ -1089,6 +1090,12 @@ munmap_back:
 			goto free_vma;
 	}
 
+	/* Don't make the VMA automatically writable if it's shared, but the
+	 * backer wishes to know when pages are first written to */
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+		vma->vm_page_prot =
+			protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
+
 	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
 	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
 	 * that memory reservation must be checked; but that reservation
@@ -1921,7 +1928,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 	vma->vm_end = addr + len;
 	vma->vm_pgoff = pgoff;
 	vma->vm_flags = flags;
-	vma->vm_page_prot = protection_map[flags & 0x0f];
+	vma->vm_page_prot = protection_map[flags &
+				(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
 	vma_link(mm, vma, prev, rb_link, rb_parent);
 out:
 	mm->total_vm += len >> PAGE_SHIFT;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 14f93e62270f..638edabaff71 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -123,6 +123,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	unsigned long oldflags = vma->vm_flags;
 	long nrpages = (end - start) >> PAGE_SHIFT;
 	unsigned long charged = 0;
+	unsigned int mask;
 	pgprot_t newprot;
 	pgoff_t pgoff;
 	int error;
@@ -149,8 +150,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 		}
 	}
 
-	newprot = protection_map[newflags & 0xf];
-
 	/*
 	 * First try to merge with previous and/or next vma.
 	 */
@@ -177,6 +176,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	}
 
 success:
+	/* Don't make the VMA automatically writable if it's shared, but the
+	 * backer wishes to know when pages are first written to */
+	mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+		mask &= ~VM_SHARED;
+
+	newprot = protection_map[newflags & mask];
+
 	/*
 	 * vm_flags and vm_page_prot are protected by the mmap_sem
 	 * held in write mode.
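For clarity on the protection_map changes in mm/mmap.c and mm/mprotect.c
above: protection_map[] is indexed by the low four bits of vm_flags (VM_READ,
VM_WRITE, VM_EXEC, VM_SHARED), and masking VM_SHARED out of the index when a
->page_mkwrite() handler is present selects the corresponding "private"
entry, whose write permission is clear, so shared-writable pages stay
read-only until the first write fault. The sketch below is a
userspace-compilable illustration of the index computation only; the flag
values match include/linux/mm.h.

/* Illustrative only: mimics how this patch picks the protection_map index. */
#include <stdio.h>

#define VM_READ		0x00000001
#define VM_WRITE	0x00000002
#define VM_EXEC		0x00000004
#define VM_SHARED	0x00000008

static unsigned int prot_index(unsigned long vm_flags, int has_page_mkwrite)
{
	unsigned int mask = VM_READ | VM_WRITE | VM_EXEC | VM_SHARED;

	/* With a ->page_mkwrite() handler, drop VM_SHARED so the
	 * write-protected "private" protection entry is chosen. */
	if (has_page_mkwrite)
		mask &= ~VM_SHARED;
	return vm_flags & mask;
}

int main(void)
{
	unsigned long flags = VM_READ | VM_WRITE | VM_SHARED;

	printf("index without handler: %u\n", prot_index(flags, 0));	/* 11 */
	printf("index with handler:    %u\n", prot_index(flags, 1));	/*  3 */
	return 0;
}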