author     Peter Zijlstra <a.p.zijlstra@chello.nl>   2006-09-26 02:30:57 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>     2006-09-26 11:48:44 -0400
commit     d08b3851da41d0ee60851f2c75b118e1f7a5fc89
tree       a01f6930a1387e8f66607e2fe16c62bb7044353b /mm/memory.c
parent     725d704ecaca4a43f067092c140d4f3271cf2856
[PATCH] mm: tracking shared dirty pages
Tracking of dirty pages in shared writeable mmap()s.
The idea is simple: write-protect clean shared writable pages, catch the
write fault, make the page writable and set it dirty. On page writeback,
clean all the PTE dirty bits and write-protect them once again.
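To make the fault-side transition concrete, here is a minimal kernel-style
sketch (illustrative only, not code from this patch; the helper name is
invented, and the generic pte accessors of that era are assumed):

#include <linux/mm.h>
#include <asm/cacheflush.h>

/*
 * Sketch: on a write fault against a clean, write-protected shared page,
 * mark the PTE writable and dirty so the write can proceed.  Locking, the
 * ->page_mkwrite() callout and error handling are omitted.
 */
static void sketch_make_writable_and_dirty(struct vm_area_struct *vma,
					   unsigned long address, pte_t *ptep)
{
	pte_t entry = *ptep;

	flush_cache_page(vma, address, pte_pfn(entry));
	entry = pte_mkyoung(entry);
	entry = pte_mkdirty(pte_mkwrite(entry));	/* writable + dirty */
	ptep_set_access_flags(vma, address, ptep, entry, 1);
	update_mmu_cache(vma, address, entry);
}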
The implementation is a tad harder, mainly because the default
backing_dev_info capabilities were too loosely maintained. Hence it is not
enough to test the backing_dev_info for cap_account_dirty.
The current heuristic is as follows; a VMA is eligible when (a sketch of the
combined test follows the list):
 - it is shared and writable
    (vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)
 - it is not a 'special' mapping
    (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) == 0
 - the backing_dev_info is cap_account_dirty
    mapping_cap_account_dirty(vma->vm_file->f_mapping)
 - f_op->mmap() didn't change the default page protection
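As a rough illustration, the combined test might look like the predicate
below (the helper name is made up for this sketch; in the patch the check is
spread across the mmap/mprotect paths rather than being one function):

#include <linux/mm.h>
#include <linux/backing-dev.h>

/* Sketch of the four eligibility conditions above as a single predicate. */
static inline int sketch_vma_accounts_dirty(struct vm_area_struct *vma)
{
	unsigned long vm_flags = vma->vm_flags;

	/* shared and writable */
	if ((vm_flags & (VM_WRITE|VM_SHARED)) != (VM_WRITE|VM_SHARED))
		return 0;

	/* not a 'special' mapping */
	if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE))
		return 0;

	/* the backing_dev_info accounts dirty pages */
	if (!vma->vm_file || !vma->vm_file->f_mapping ||
	    !mapping_cap_account_dirty(vma->vm_file->f_mapping))
		return 0;

	/* f_op->mmap() left the default page protection alone */
	return pgprot_val(vma->vm_page_prot) ==
	       pgprot_val(protection_map[vm_flags &
			  (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
}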
Pages from remap_pfn_range() are explicitly excluded because their COW
semantics are already horrid enough (see vm_normal_page() in do_wp_page()) and
because they don't have a backing store anyway.
mprotect() is taught about the new behaviour as well. However, it overrides
the last condition.
Cleaning the pages on writeback is done with page_mkclean(), a new rmap call.
It can be called on any page, but is currently only implemented for mapped
pages; if the page is found to be in a VMA that accounts dirty pages, it will
also wrprotect the PTE.
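Per mapping of the page, the PTE-level work can be sketched like this
(simplified: the rmap walk, PTE lookup and locking are left out, and the
function name is invented for the sketch):

#include <linux/mm.h>
#include <asm/cacheflush.h>

/*
 * Sketch: clean one PTE at writeback time.  Clear and flush the old entry,
 * then reinstall it write-protected and clean, so the next store faults
 * again and re-dirties the page through the write-fault path above.
 */
static void sketch_mkclean_one_pte(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	flush_cache_page(vma, address, pte_pfn(*ptep));
	entry = ptep_clear_flush(vma, address, ptep);	/* also flushes the TLB */
	entry = pte_wrprotect(entry);
	entry = pte_mkclean(entry);
	set_pte_at(vma->vm_mm, address, ptep, entry);
}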
Finally, in fs/buffer.c:try_to_free_buffers(), remove clear_page_dirty() from
under ->private_lock. This seems to be safe, since ->private_lock is used to
serialize access to the buffers, not the page itself. This is needed because
clear_page_dirty() will call into page_mkclean() and would thereby violate
the locking order.
[dhowells@redhat.com: Provide a page_mkclean() implementation for NOMMU]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r--   mm/memory.c   29
1 files changed, 23 insertions, 6 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 109e9866237e..fa941b169071 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1458,14 +1458,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
-	int reuse, ret = VM_FAULT_MINOR;
+	int reuse = 0, ret = VM_FAULT_MINOR;
+	struct page *dirty_page = NULL;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
 	if (!old_page)
 		goto gotten;
 
-	if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
-				(VM_SHARED|VM_WRITE))) {
+	/*
+	 * Only catch write-faults on shared writable pages, read-only
+	 * shared pages can get COWed by get_user_pages(.write=1, .force=1).
+	 */
+	if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+					(VM_WRITE|VM_SHARED))) {
 		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
 			/*
 			 * Notify the address space that the page is about to
@@ -1494,13 +1499,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			if (!pte_same(*page_table, orig_pte))
 				goto unlock;
 		}
-
+		dirty_page = old_page;
+		get_page(dirty_page);
 		reuse = 1;
 	} else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
 		reuse = can_share_swap_page(old_page);
 		unlock_page(old_page);
-	} else {
-		reuse = 0;
 	}
 
 	if (reuse) {
@@ -1566,6 +1570,10 @@ gotten:
 		page_cache_release(old_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (dirty_page) {
+		set_page_dirty(dirty_page);
+		put_page(dirty_page);
+	}
 	return ret;
 oom:
 	if (old_page)
@@ -2098,6 +2106,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned int sequence = 0;
 	int ret = VM_FAULT_MINOR;
 	int anon = 0;
+	struct page *dirty_page = NULL;
 
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
@@ -2192,6 +2201,10 @@ retry:
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
+			if (write_access) {
+				dirty_page = new_page;
+				get_page(dirty_page);
+			}
 		}
 	} else {
 		/* One of our sibling threads was faster, back out. */
@@ -2204,6 +2217,10 @@ retry:
 	lazy_mmu_prot_update(entry);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (dirty_page) {
+		set_page_dirty(dirty_page);
+		put_page(dirty_page);
+	}
 	return ret;
 oom:
 	page_cache_release(new_page);