author     Peter Zijlstra <a.p.zijlstra@chello.nl>    2006-09-26 02:30:57 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>      2006-09-26 11:48:44 -0400
commit     d08b3851da41d0ee60851f2c75b118e1f7a5fc89
tree       a01f6930a1387e8f66607e2fe16c62bb7044353b
parent     725d704ecaca4a43f067092c140d4f3271cf2856
[PATCH] mm: tracking shared dirty pages
Tracking of dirty pages in shared writeable mmap()s.

The idea is simple: write-protect clean shared writeable pages, catch the
write fault, make the page writeable and set it dirty.  On page write-back,
clean all the PTE dirty bits and write-protect them once again.

The implementation is a tad harder, mainly because the default
backing_dev_info capabilities were too loosely maintained.  Hence it is not
enough to test the backing_dev_info for cap_account_dirty.

The current heuristic is as follows; a VMA is eligible when:

 - it is shared writeable:
     (vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)
 - it is not a 'special' mapping:
     (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) == 0
 - the backing_dev_info is cap_account_dirty:
     mapping_cap_account_dirty(vma->vm_file->f_mapping)
 - f_op->mmap() didn't change the default page protection

Pages from remap_pfn_range() are explicitly excluded because their COW
semantics are already horrid enough (see vm_normal_page() in do_wp_page())
and because they don't have a backing store anyway.

mprotect() is taught about the new behaviour as well; however, it overrides
the last condition.

Cleaning the pages on write-back is done with page_mkclean(), a new rmap
call.  It can be called on any page, but is currently only implemented for
mapped pages; if the page is found to be of a VMA that accounts dirty pages,
it will also write-protect the PTE.

Finally, in fs/buffer.c:try_to_free_buffers(), remove clear_page_dirty() from
under ->private_lock.  This seems to be safe, since ->private_lock is used to
serialize access to the buffers, not the page itself.  This is needed because
clear_page_dirty() will call into page_mkclean() and would thereby violate
the locking order.

[dhowells@redhat.com: Provide a page_mkclean() implementation for NOMMU]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
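As a rough, stand-alone illustration of the eligibility heuristic above (not
part of the patch): the sketch below models the check in user-space C.  The
struct fake_vma type, its boolean fields and the flag bit values are
simplified stand-ins invented for the example; the real check is the
vma_wants_writenotify() helper this patch adds to include/linux/mm.h.

/*
 * Minimal user-space sketch of the write-notification eligibility test.
 * All types, fields and flag values here are illustrative stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

#define VM_READ       0x1u
#define VM_WRITE      0x2u
#define VM_SHARED     0x8u
#define VM_PFNMAP     0x400u      /* illustrative bit values */
#define VM_INSERTPAGE 0x2000000u

struct fake_vma {
        unsigned int vm_flags;
        bool has_page_mkwrite;   /* backer implements ->page_mkwrite */
        bool prot_was_modified;  /* f_op->mmap() changed the default protection */
        bool cap_account_dirty;  /* backing_dev_info accounts dirty pages */
};

static int wants_writenotify(const struct fake_vma *vma)
{
        /* Must be shared and writeable, otherwise writes already fault. */
        if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) != (VM_WRITE | VM_SHARED))
                return 0;

        /* A ->page_mkwrite handler always wants to see the first write. */
        if (vma->has_page_mkwrite)
                return 1;

        /* f_op->mmap() installed non-default protections: leave them alone. */
        if (vma->prot_was_modified)
                return 0;

        /* 'Special' mappings (remap_pfn_range() and friends) are excluded. */
        if (vma->vm_flags & (VM_PFNMAP | VM_INSERTPAGE))
                return 0;

        /* Finally, the backing store must be able to account dirty pages. */
        return vma->cap_account_dirty;
}

int main(void)
{
        struct fake_vma vma = {
                .vm_flags = VM_READ | VM_WRITE | VM_SHARED,
                .cap_account_dirty = true,
        };

        printf("write-notify: %d\n", wants_writenotify(&vma));   /* prints 1 */

        vma.vm_flags |= VM_PFNMAP;
        printf("write-notify: %d\n", wants_writenotify(&vma));   /* prints 0 */
        return 0;
}

Any VMA for which this test says yes has its vm_page_prot downgraded to the
read-only variant, so the first store into each page takes the fault path in
do_wp_page()/do_no_page(), where the page is marked dirty and made writeable.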
-rw-r--r--   fs/buffer.c            2
-rw-r--r--   include/linux/mm.h    34
-rw-r--r--   include/linux/rmap.h  14
-rw-r--r--   mm/memory.c           29
-rw-r--r--   mm/mmap.c             10
-rw-r--r--   mm/mprotect.c         21
-rw-r--r--   mm/page-writeback.c   17
-rw-r--r--   mm/rmap.c             65
8 files changed, 162 insertions(+), 30 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 71649ef9b658..3b6d701073e7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2987,6 +2987,7 @@ int try_to_free_buffers(struct page *page)
 
         spin_lock(&mapping->private_lock);
         ret = drop_buffers(page, &buffers_to_free);
+        spin_unlock(&mapping->private_lock);
         if (ret) {
                 /*
                  * If the filesystem writes its buffers by hand (eg ext3)
@@ -2998,7 +2999,6 @@ int try_to_free_buffers(struct page *page)
                  */
                 clear_page_dirty(page);
         }
-        spin_unlock(&mapping->private_lock);
 out:
         if (buffers_to_free) {
                 struct buffer_head *bh = buffers_to_free;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7d20b25c58fc..449841413cf1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -15,6 +15,7 @@
 #include <linux/fs.h>
 #include <linux/mutex.h>
 #include <linux/debug_locks.h>
+#include <linux/backing-dev.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -810,6 +811,39 @@ struct shrinker;
 extern struct shrinker *set_shrinker(int, shrinker_t);
 extern void remove_shrinker(struct shrinker *shrinker);
 
+/*
+ * Some shared mappings will want the pages marked read-only
+ * to track write events. If so, we'll downgrade vm_page_prot
+ * to the private version (using protection_map[] without the
+ * VM_SHARED bit).
+ */
+static inline int vma_wants_writenotify(struct vm_area_struct *vma)
+{
+        unsigned int vm_flags = vma->vm_flags;
+
+        /* If it was private or non-writable, the write bit is already clear */
+        if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
+                return 0;
+
+        /* The backer wishes to know when pages are first written to? */
+        if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+                return 1;
+
+        /* The open routine did something to the protections already? */
+        if (pgprot_val(vma->vm_page_prot) !=
+            pgprot_val(protection_map[vm_flags &
+                    (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]))
+                return 0;
+
+        /* Specialty mapping? */
+        if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE))
+                return 0;
+
+        /* Can the mapping track the dirty pages? */
+        return vma->vm_file && vma->vm_file->f_mapping &&
+                mapping_cap_account_dirty(vma->vm_file->f_mapping);
+}
+
 extern pte_t *FASTCALL(get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl));
 
 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bf97b0900014..db2c1df4fef9 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -103,6 +103,14 @@ pte_t *page_check_address(struct page *, struct mm_struct *,
  */
 unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 
+/*
+ * Cleans the PTEs of shared mappings.
+ * (and since clean PTEs should also be readonly, write protects them too)
+ *
+ * returns the number of cleaned PTEs.
+ */
+int page_mkclean(struct page *);
+
 #else   /* !CONFIG_MMU */
 
 #define anon_vma_init() do {} while (0)
@@ -112,6 +120,12 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 #define page_referenced(page,l) TestClearPageReferenced(page)
 #define try_to_unmap(page, refs) SWAP_FAIL
 
+static inline int page_mkclean(struct page *page)
+{
+        return 0;
+}
+
+
 #endif  /* CONFIG_MMU */
 
 /*
diff --git a/mm/memory.c b/mm/memory.c
index 109e9866237e..fa941b169071 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1458,14 +1458,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
         struct page *old_page, *new_page;
         pte_t entry;
-        int reuse, ret = VM_FAULT_MINOR;
+        int reuse = 0, ret = VM_FAULT_MINOR;
+        struct page *dirty_page = NULL;
 
         old_page = vm_normal_page(vma, address, orig_pte);
         if (!old_page)
                 goto gotten;
 
-        if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
-                                (VM_SHARED|VM_WRITE))) {
+        /*
+         * Only catch write-faults on shared writable pages, read-only
+         * shared pages can get COWed by get_user_pages(.write=1, .force=1).
+         */
+        if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+                                (VM_WRITE|VM_SHARED))) {
                 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
                         /*
                          * Notify the address space that the page is about to
@@ -1494,13 +1499,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         if (!pte_same(*page_table, orig_pte))
                                 goto unlock;
                 }
-
+                dirty_page = old_page;
+                get_page(dirty_page);
                 reuse = 1;
         } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
                 reuse = can_share_swap_page(old_page);
                 unlock_page(old_page);
-        } else {
-                reuse = 0;
         }
 
         if (reuse) {
@@ -1566,6 +1570,10 @@ gotten:
                 page_cache_release(old_page);
 unlock:
         pte_unmap_unlock(page_table, ptl);
+        if (dirty_page) {
+                set_page_dirty(dirty_page);
+                put_page(dirty_page);
+        }
         return ret;
 oom:
         if (old_page)
@@ -2098,6 +2106,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
         unsigned int sequence = 0;
         int ret = VM_FAULT_MINOR;
         int anon = 0;
+        struct page *dirty_page = NULL;
 
         pte_unmap(page_table);
         BUG_ON(vma->vm_flags & VM_PFNMAP);
@@ -2192,6 +2201,10 @@ retry:
                 } else {
                         inc_mm_counter(mm, file_rss);
                         page_add_file_rmap(new_page);
+                        if (write_access) {
+                                dirty_page = new_page;
+                                get_page(dirty_page);
+                        }
                 }
         } else {
                 /* One of our sibling threads was faster, back out. */
@@ -2204,6 +2217,10 @@ retry:
         lazy_mmu_prot_update(entry);
 unlock:
         pte_unmap_unlock(page_table, ptl);
+        if (dirty_page) {
+                set_page_dirty(dirty_page);
+                put_page(dirty_page);
+        }
         return ret;
 oom:
         page_cache_release(new_page);
diff --git a/mm/mmap.c b/mm/mmap.c
index d799d896d74a..8507ee9cd573 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1105,12 +1105,6 @@ munmap_back:
                 goto free_vma;
         }
 
-        /* Don't make the VMA automatically writable if it's shared, but the
-         * backer wishes to know when pages are first written to */
-        if (vma->vm_ops && vma->vm_ops->page_mkwrite)
-                vma->vm_page_prot =
-                        protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
-
         /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
          * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
          * that memory reservation must be checked; but that reservation
@@ -1128,6 +1122,10 @@ munmap_back:
         pgoff = vma->vm_pgoff;
         vm_flags = vma->vm_flags;
 
+        if (vma_wants_writenotify(vma))
+                vma->vm_page_prot =
+                        protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
+
         if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
                         vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
                 file = vma->vm_file;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 638edabaff71..367b7f6c0637 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -123,8 +123,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
         unsigned long oldflags = vma->vm_flags;
         long nrpages = (end - start) >> PAGE_SHIFT;
         unsigned long charged = 0;
-        unsigned int mask;
-        pgprot_t newprot;
         pgoff_t pgoff;
         int error;
 
@@ -176,24 +174,21 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
         }
 
 success:
-        /* Don't make the VMA automatically writable if it's shared, but the
-         * backer wishes to know when pages are first written to */
-        mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
-        if (vma->vm_ops && vma->vm_ops->page_mkwrite)
-                mask &= ~VM_SHARED;
-
-        newprot = protection_map[newflags & mask];
-
         /*
          * vm_flags and vm_page_prot are protected by the mmap_sem
          * held in write mode.
          */
         vma->vm_flags = newflags;
-        vma->vm_page_prot = newprot;
+        vma->vm_page_prot = protection_map[newflags &
+                (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
+        if (vma_wants_writenotify(vma))
+                vma->vm_page_prot = protection_map[newflags &
+                        (VM_READ|VM_WRITE|VM_EXEC)];
+
         if (is_vm_hugetlb_page(vma))
-                hugetlb_change_protection(vma, start, end, newprot);
+                hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
         else
-                change_protection(vma, start, end, newprot);
+                change_protection(vma, start, end, vma->vm_page_prot);
         vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
         vm_stat_account(mm, newflags, vma->vm_file, nrpages);
         return 0;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 77a0bc4e261a..1c87430b7a25 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -23,6 +23,7 @@
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
 #include <linux/mpage.h>
+#include <linux/rmap.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
@@ -550,7 +551,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
                 return 0;
         wbc->for_writepages = 1;
         if (mapping->a_ops->writepages)
-                ret = mapping->a_ops->writepages(mapping, wbc);
+                ret = mapping->a_ops->writepages(mapping, wbc);
         else
                 ret = generic_writepages(mapping, wbc);
         wbc->for_writepages = 0;
@@ -712,9 +713,15 @@ int test_clear_page_dirty(struct page *page)
                         radix_tree_tag_clear(&mapping->page_tree,
                                                 page_index(page),
                                                 PAGECACHE_TAG_DIRTY);
-                        if (mapping_cap_account_dirty(mapping))
-                                __dec_zone_page_state(page, NR_FILE_DIRTY);
                         write_unlock_irqrestore(&mapping->tree_lock, flags);
+                        /*
+                         * We can continue to use `mapping' here because the
+                         * page is locked, which pins the address_space
+                         */
+                        if (mapping_cap_account_dirty(mapping)) {
+                                page_mkclean(page);
+                                dec_zone_page_state(page, NR_FILE_DIRTY);
+                        }
                         return 1;
                 }
                 write_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -744,8 +751,10 @@ int clear_page_dirty_for_io(struct page *page)
 
         if (mapping) {
                 if (TestClearPageDirty(page)) {
-                        if (mapping_cap_account_dirty(mapping))
+                        if (mapping_cap_account_dirty(mapping)) {
+                                page_mkclean(page);
                                 dec_zone_page_state(page, NR_FILE_DIRTY);
+                        }
                         return 1;
                 }
                 return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index 40158b59729e..e2155d791d99 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -434,6 +434,71 @@ int page_referenced(struct page *page, int is_locked)
         return referenced;
 }
 
+static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
+{
+        struct mm_struct *mm = vma->vm_mm;
+        unsigned long address;
+        pte_t *pte, entry;
+        spinlock_t *ptl;
+        int ret = 0;
+
+        address = vma_address(page, vma);
+        if (address == -EFAULT)
+                goto out;
+
+        pte = page_check_address(page, mm, address, &ptl);
+        if (!pte)
+                goto out;
+
+        if (!pte_dirty(*pte) && !pte_write(*pte))
+                goto unlock;
+
+        entry = ptep_get_and_clear(mm, address, pte);
+        entry = pte_mkclean(entry);
+        entry = pte_wrprotect(entry);
+        ptep_establish(vma, address, pte, entry);
+        lazy_mmu_prot_update(entry);
+        ret = 1;
+
+unlock:
+        pte_unmap_unlock(pte, ptl);
+out:
+        return ret;
+}
+
+static int page_mkclean_file(struct address_space *mapping, struct page *page)
+{
+        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+        struct vm_area_struct *vma;
+        struct prio_tree_iter iter;
+        int ret = 0;
+
+        BUG_ON(PageAnon(page));
+
+        spin_lock(&mapping->i_mmap_lock);
+        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+                if (vma->vm_flags & VM_SHARED)
+                        ret += page_mkclean_one(page, vma);
+        }
+        spin_unlock(&mapping->i_mmap_lock);
+        return ret;
+}
+
+int page_mkclean(struct page *page)
+{
+        int ret = 0;
+
+        BUG_ON(!PageLocked(page));
+
+        if (page_mapped(page)) {
+                struct address_space *mapping = page_mapping(page);
+                if (mapping)
+                        ret = page_mkclean_file(mapping, page);
+        }
+
+        return ret;
+}
+
 /**
  * page_set_anon_rmap - setup new anonymous rmap
  * @page:       the page to add the mapping to