author     Peter Zijlstra <a.p.zijlstra@chello.nl>    2006-09-26 02:30:57 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>      2006-09-26 11:48:44 -0400
commit     d08b3851da41d0ee60851f2c75b118e1f7a5fc89 (patch)
tree       a01f6930a1387e8f66607e2fe16c62bb7044353b /mm
parent     725d704ecaca4a43f067092c140d4f3271cf2856 (diff)
[PATCH] mm: tracking shared dirty pages
Tracking of dirty pages in shared writeable mmap()s.
The idea is simple: write-protect clean shared writeable pages, catch the
write fault, make the page writeable again and set the dirty bit. On page
write-back, clean all the PTE dirty bits and write-protect the pages once again.
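For illustration only, the PTE transitions behind this cycle look roughly like
the fragment below; it uses the generic pte_* helpers, with 'entry' standing
for the PTE of a shared file page, rather than the actual fault and write-back
paths shown in the diff further down.

	/* write fault: make the PTE writeable and dirty, so subsequent
	 * writes to the page proceed without faulting */
	entry = pte_mkdirty(pte_mkwrite(entry));

	/* write-back: clean the dirty bit and write-protect again, so
	 * the next write faults and the page is noticed as dirty anew */
	entry = pte_wrprotect(pte_mkclean(entry));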
The implementation is a tad harder, mainly because the default
backing_dev_info capabilities were too loosely maintained. Hence it is not
enough to test the backing_dev_info for cap_account_dirty.
The current heuristic is as follows; a VMA is eligible when all of the
conditions below hold (a sketch of the combined check follows the list):
- it is shared writeable
(vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)
- it is not a 'special' mapping
(vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) == 0
- the backing_dev_info is cap_account_dirty
mapping_cap_account_dirty(vma->vm_file->f_mapping)
- f_op->mmap() didn't change the default page protection
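A sketch of the combined check, assembled from the conditions above: the
patch's helper for this test is vma_wants_writenotify() (used in the mm/mmap.c
and mm/mprotect.c hunks below), but the body shown here is a reconstruction for
illustration, with a hypothetical name, and may differ in detail from the real
implementation.

	static int vma_wants_writenotify_sketch(struct vm_area_struct *vma)
	{
		unsigned long vm_flags = vma->vm_flags;

		/* shared and writeable */
		if ((vm_flags & (VM_WRITE|VM_SHARED)) != (VM_WRITE|VM_SHARED))
			return 0;

		/* not a 'special' mapping */
		if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE))
			return 0;

		/* the backing_dev_info accounts dirty pages */
		if (!vma->vm_file || !vma->vm_file->f_mapping ||
		    !mapping_cap_account_dirty(vma->vm_file->f_mapping))
			return 0;

		/* f_op->mmap() didn't change the default page protection */
		return pgprot_val(vma->vm_page_prot) ==
		       pgprot_val(protection_map[vm_flags &
				  (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
	}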
Pages from remap_pfn_range() are explicitly excluded because their COW
semantics are already horrid enough (see vm_normal_page() in do_wp_page()) and
because they don't have a backing store anyway.
mprotect() is taught about the new behaviour as well. However, it overrides
the last condition.
Cleaning the pages on write-back is done with page_mkclean(), a new rmap call.
It can be called on any page, but is currently only implemented for mapped
pages; if the page is found to belong to a VMA that accounts dirty pages, it
will also write-protect the PTE.
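As a usage sketch, this mirrors what the mm/page-writeback.c hunks below do
when clearing a page's dirty flag; the caller holds the page lock, which
page_mkclean() asserts.

	if (mapping_cap_account_dirty(mapping)) {
		page_mkclean(page);	/* clean and write-protect the mapped PTEs */
		dec_zone_page_state(page, NR_FILE_DIRTY);
	}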
Finally, in fs/buffer.c:try_to_free_buffers(), remove clear_page_dirty() from
under ->private_lock. This seems to be safe, since ->private_lock is used to
serialize access to the buffers, not the page itself. This is needed because
clear_page_dirty() will call into page_mkclean() and would thereby violate the
locking order.
[dhowells@redhat.com: Provide a page_mkclean() implementation for NOMMU]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm')
-rw-r--r--  mm/memory.c          29
-rw-r--r--  mm/mmap.c            10
-rw-r--r--  mm/mprotect.c        21
-rw-r--r--  mm/page-writeback.c  17
-rw-r--r--  mm/rmap.c            65
5 files changed, 113 insertions(+), 29 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 109e9866237e..fa941b169071 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1458,14 +1458,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
-	int reuse, ret = VM_FAULT_MINOR;
+	int reuse = 0, ret = VM_FAULT_MINOR;
+	struct page *dirty_page = NULL;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
 	if (!old_page)
 		goto gotten;
 
-	if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
-				(VM_SHARED|VM_WRITE))) {
+	/*
+	 * Only catch write-faults on shared writable pages, read-only
+	 * shared pages can get COWed by get_user_pages(.write=1, .force=1).
+	 */
+	if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+					(VM_WRITE|VM_SHARED))) {
 		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
 			/*
 			 * Notify the address space that the page is about to
@@ -1494,13 +1499,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			if (!pte_same(*page_table, orig_pte))
 				goto unlock;
 		}
-
+		dirty_page = old_page;
+		get_page(dirty_page);
 		reuse = 1;
 	} else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
 		reuse = can_share_swap_page(old_page);
 		unlock_page(old_page);
-	} else {
-		reuse = 0;
 	}
 
 	if (reuse) {
@@ -1566,6 +1570,10 @@ gotten:
 		page_cache_release(old_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (dirty_page) {
+		set_page_dirty(dirty_page);
+		put_page(dirty_page);
+	}
 	return ret;
 oom:
 	if (old_page)
@@ -2098,6 +2106,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned int sequence = 0;
 	int ret = VM_FAULT_MINOR;
 	int anon = 0;
+	struct page *dirty_page = NULL;
 
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
@@ -2192,6 +2201,10 @@ retry:
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
+			if (write_access) {
+				dirty_page = new_page;
+				get_page(dirty_page);
+			}
 		}
 	} else {
 		/* One of our sibling threads was faster, back out. */
@@ -2204,6 +2217,10 @@ retry:
 	lazy_mmu_prot_update(entry);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (dirty_page) {
+		set_page_dirty(dirty_page);
+		put_page(dirty_page);
+	}
 	return ret;
 oom:
 	page_cache_release(new_page);
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1105,12 +1105,6 @@ munmap_back:
 		goto free_vma;
 	}
 
-	/* Don't make the VMA automatically writable if it's shared, but the
-	 * backer wishes to know when pages are first written to */
-	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
-		vma->vm_page_prot =
-			protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
-
 	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
 	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
 	 * that memory reservation must be checked; but that reservation
@@ -1128,6 +1122,10 @@ munmap_back:
 	pgoff = vma->vm_pgoff;
 	vm_flags = vma->vm_flags;
 
+	if (vma_wants_writenotify(vma))
+		vma->vm_page_prot =
+			protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
+
 	if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
 			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
 		file = vma->vm_file;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 638edabaff71..367b7f6c0637 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -123,8 +123,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	unsigned long oldflags = vma->vm_flags;
 	long nrpages = (end - start) >> PAGE_SHIFT;
 	unsigned long charged = 0;
-	unsigned int mask;
-	pgprot_t newprot;
 	pgoff_t pgoff;
 	int error;
 
@@ -176,24 +174,21 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	}
 
 success:
-	/* Don't make the VMA automatically writable if it's shared, but the
-	 * backer wishes to know when pages are first written to */
-	mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
-	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
-		mask &= ~VM_SHARED;
-
-	newprot = protection_map[newflags & mask];
-
 	/*
 	 * vm_flags and vm_page_prot are protected by the mmap_sem
 	 * held in write mode.
 	 */
 	vma->vm_flags = newflags;
-	vma->vm_page_prot = newprot;
+	vma->vm_page_prot = protection_map[newflags &
+		(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
+	if (vma_wants_writenotify(vma))
+		vma->vm_page_prot = protection_map[newflags &
+			(VM_READ|VM_WRITE|VM_EXEC)];
+
 	if (is_vm_hugetlb_page(vma))
-		hugetlb_change_protection(vma, start, end, newprot);
+		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
 	else
-		change_protection(vma, start, end, newprot);
+		change_protection(vma, start, end, vma->vm_page_prot);
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 77a0bc4e261a..1c87430b7a25 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -23,6 +23,7 @@
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
 #include <linux/mpage.h>
+#include <linux/rmap.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
@@ -550,7 +551,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 		return 0;
 	wbc->for_writepages = 1;
 	if (mapping->a_ops->writepages)
-                ret = mapping->a_ops->writepages(mapping, wbc);
+		ret = mapping->a_ops->writepages(mapping, wbc);
 	else
 		ret = generic_writepages(mapping, wbc);
 	wbc->for_writepages = 0;
@@ -712,9 +713,15 @@ int test_clear_page_dirty(struct page *page)
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
-			if (mapping_cap_account_dirty(mapping))
-				__dec_zone_page_state(page, NR_FILE_DIRTY);
 			write_unlock_irqrestore(&mapping->tree_lock, flags);
+			/*
+			 * We can continue to use `mapping' here because the
+			 * page is locked, which pins the address_space
+			 */
+			if (mapping_cap_account_dirty(mapping)) {
+				page_mkclean(page);
+				dec_zone_page_state(page, NR_FILE_DIRTY);
+			}
 			return 1;
 		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -744,8 +751,10 @@ int clear_page_dirty_for_io(struct page *page)
 
 	if (mapping) {
 		if (TestClearPageDirty(page)) {
-			if (mapping_cap_account_dirty(mapping))
+			if (mapping_cap_account_dirty(mapping)) {
+				page_mkclean(page);
 				dec_zone_page_state(page, NR_FILE_DIRTY);
+			}
 			return 1;
 		}
 		return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -434,6 +434,71 @@ int page_referenced(struct page *page, int is_locked)
 	return referenced;
 }
 
+static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pte_t *pte, entry;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	address = vma_address(page, vma);
+	if (address == -EFAULT)
+		goto out;
+
+	pte = page_check_address(page, mm, address, &ptl);
+	if (!pte)
+		goto out;
+
+	if (!pte_dirty(*pte) && !pte_write(*pte))
+		goto unlock;
+
+	entry = ptep_get_and_clear(mm, address, pte);
+	entry = pte_mkclean(entry);
+	entry = pte_wrprotect(entry);
+	ptep_establish(vma, address, pte, entry);
+	lazy_mmu_prot_update(entry);
+	ret = 1;
+
+unlock:
+	pte_unmap_unlock(pte, ptl);
+out:
+	return ret;
+}
+
+static int page_mkclean_file(struct address_space *mapping, struct page *page)
+{
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int ret = 0;
+
+	BUG_ON(PageAnon(page));
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		if (vma->vm_flags & VM_SHARED)
+			ret += page_mkclean_one(page, vma);
+	}
+	spin_unlock(&mapping->i_mmap_lock);
+	return ret;
+}
+
+int page_mkclean(struct page *page)
+{
+	int ret = 0;
+
+	BUG_ON(!PageLocked(page));
+
+	if (page_mapped(page)) {
+		struct address_space *mapping = page_mapping(page);
+		if (mapping)
+			ret = page_mkclean_file(mapping, page);
+	}
+
+	return ret;
+}
+
 /**
  * page_set_anon_rmap - setup new anonymous rmap
  * @page: the page to add the mapping to