author     Andrew Morton <akpm@osdl.org>            2006-03-24 06:18:12 -0500
committer  Linus Torvalds <torvalds@g5.osdl.org>    2006-03-24 10:33:26 -0500
commit     9c50823eebf7c256b92b4e0f02b5fb30e97788c2 (patch)
tree       241b754698c3501355d1061c828ab6d894dba617  /mm/msync.c
parent     4741c9fd36b3bcadd37238321c469049da94a4b9 (diff)
[PATCH] msync(): perform dirty page levelling
It seems sensible to perform dirty page throttling in msync: as the application dirties pages we can kick off pdflush early, or even force the msync() caller to perform writeout, or even throttle the msync() caller.

The main effect of this is to start disk writeback earlier if we've just discovered that a large amount of pagecache has been dirtied. (Otherwise it wouldn't happen for up to five seconds, next time pdflush wakes up.)

It also will cause the page-dirtying process to get penalised for dirtying those pages rather than whacking someone else with the problem.

We should do this for munmap() and possibly even exit(), too.

We drop the mmap_sem while performing the dirty page balancing. It doesn't seem right to hold mmap_sem for that long.

Note that this patch only affects MS_ASYNC. MS_SYNC will be syncing all the dirty pages anyway.

We note that msync(MS_SYNC) does a full-file-sync inside mmap_sem, and always has. We can fix that up...

The patch also tightens up the mmap_sem coverage in sys_msync(): no point in taking it while we perform the incoming arg checking.

Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
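The mechanism, condensed from the patch below into a short annotated excerpt (the identifiers are the patch's own; the comments are added here for explanation and the surrounding loop is elided): msync_page_range() now returns the number of pages it redirtied, and for MS_ASYNC the sys_msync() loop drops mmap_sem, pins the file, and lets balance_dirty_pages_ratelimited_nr() throttle the caller before the vma walk continues.

	/* In the sys_msync() vma loop, after msync_interval() has transferred
	 * pte dirty bits to the pagecache and counted them in nr_pages_dirtied: */
	file = vma->vm_file;
	start = vma->vm_end;
	if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
		get_file(file);				/* keep the file pinned across the unlock */
		up_read(&current->mm->mmap_sem);	/* don't throttle while holding mmap_sem */
		balance_dirty_pages_ratelimited_nr(file->f_mapping,
						nr_pages_dirtied);
		fput(file);
		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, start);	/* the vma may have changed; re-lookup */
	} else {
		vma = vma->vm_next;
	}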
Diffstat (limited to 'mm/msync.c')
-rw-r--r--  mm/msync.c  93
1 file changed, 61 insertions(+), 32 deletions(-)
diff --git a/mm/msync.c b/mm/msync.c
index 3563a56e1a51..8a66f5d5d4f0 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -12,17 +12,20 @@
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/hugetlb.h>
+#include <linux/writeback.h>
+#include <linux/file.h>
 #include <linux/syscalls.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 
-static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end)
 {
 	pte_t *pte;
 	spinlock_t *ptl;
 	int progress = 0;
+	unsigned long ret = 0;
 
 again:
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -43,58 +46,64 @@ again:
 		if (!page)
 			continue;
 		if (ptep_clear_flush_dirty(vma, addr, pte) ||
 		    page_test_and_clear_dirty(page))
-			set_page_dirty(page);
+			ret += set_page_dirty(page);
 		progress += 3;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
 	if (addr != end)
 		goto again;
+	return ret;
 }
 
-static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-			unsigned long addr, unsigned long end)
+static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
+			pud_t *pud, unsigned long addr, unsigned long end)
 {
 	pmd_t *pmd;
 	unsigned long next;
+	unsigned long ret = 0;
 
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		msync_pte_range(vma, pmd, addr, next);
+		ret += msync_pte_range(vma, pmd, addr, next);
 	} while (pmd++, addr = next, addr != end);
+	return ret;
 }
 
-static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-			unsigned long addr, unsigned long end)
+static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
+			pgd_t *pgd, unsigned long addr, unsigned long end)
 {
 	pud_t *pud;
 	unsigned long next;
+	unsigned long ret = 0;
 
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		msync_pmd_range(vma, pud, addr, next);
+		ret += msync_pmd_range(vma, pud, addr, next);
 	} while (pud++, addr = next, addr != end);
+	return ret;
 }
 
-static void msync_page_range(struct vm_area_struct *vma,
+static unsigned long msync_page_range(struct vm_area_struct *vma,
 			unsigned long addr, unsigned long end)
 {
 	pgd_t *pgd;
 	unsigned long next;
+	unsigned long ret = 0;
 
 	/* For hugepages we can't go walking the page table normally,
 	 * but that's ok, hugetlbfs is memory based, so we don't need
 	 * to do anything more on an msync().
 	 */
 	if (vma->vm_flags & VM_HUGETLB)
-		return;
+		return 0;
 
 	BUG_ON(addr >= end);
 	pgd = pgd_offset(vma->vm_mm, addr);
@@ -103,8 +112,9 @@ static void msync_page_range(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		msync_pud_range(vma, pgd, addr, next);
+		ret += msync_pud_range(vma, pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
+	return ret;
 }
 
 /*
@@ -118,8 +128,9 @@ static void msync_page_range(struct vm_area_struct *vma,
  * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to
  * applications.
  */
-static int msync_interval(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end, int flags)
+static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long end, int flags,
+		unsigned long *nr_pages_dirtied)
 {
 	int ret = 0;
 	struct file *file = vma->vm_file;
@@ -128,7 +139,7 @@ static int msync_interval(struct vm_area_struct *vma,
 		return -EBUSY;
 
 	if (file && (vma->vm_flags & VM_SHARED)) {
-		msync_page_range(vma, addr, end);
+		*nr_pages_dirtied = msync_page_range(vma, addr, end);
 
 		if (flags & MS_SYNC) {
 			struct address_space *mapping = file->f_mapping;
@@ -157,11 +168,8 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 	unsigned long end;
 	struct vm_area_struct *vma;
 	int unmapped_error, error = -EINVAL;
+	int done = 0;
 
-	if (flags & MS_SYNC)
-		current->flags |= PF_SYNCWRITE;
-
-	down_read(&current->mm->mmap_sem);
 	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
 		goto out;
 	if (start & ~PAGE_MASK)
@@ -180,13 +188,19 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 	 * If the interval [start,end) covers some unmapped address ranges,
 	 * just ignore them, but return -ENOMEM at the end.
 	 */
+	down_read(&current->mm->mmap_sem);
+	if (flags & MS_SYNC)
+		current->flags |= PF_SYNCWRITE;
 	vma = find_vma(current->mm, start);
 	unmapped_error = 0;
-	for (;;) {
+	do {
+		unsigned long nr_pages_dirtied = 0;
+		struct file *file;
+
 		/* Still start < end. */
 		error = -ENOMEM;
 		if (!vma)
-			goto out;
+			goto out_unlock;
 		/* Here start < vma->vm_end. */
 		if (start < vma->vm_start) {
 			unmapped_error = -ENOMEM;
@@ -195,22 +209,37 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 		/* Here vma->vm_start <= start < vma->vm_end. */
 		if (end <= vma->vm_end) {
 			if (start < end) {
-				error = msync_interval(vma, start, end, flags);
+				error = msync_interval(vma, start, end, flags,
+							&nr_pages_dirtied);
 				if (error)
-					goto out;
+					goto out_unlock;
 			}
 			error = unmapped_error;
-			goto out;
+			done = 1;
+		} else {
+			/* Here vma->vm_start <= start < vma->vm_end < end. */
+			error = msync_interval(vma, start, vma->vm_end, flags,
+					&nr_pages_dirtied);
+			if (error)
+				goto out_unlock;
 		}
-		/* Here vma->vm_start <= start < vma->vm_end < end. */
-		error = msync_interval(vma, start, vma->vm_end, flags);
-		if (error)
-			goto out;
+		file = vma->vm_file;
 		start = vma->vm_end;
-		vma = vma->vm_next;
-	}
-out:
-	up_read(&current->mm->mmap_sem);
+		if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
+			get_file(file);
+			up_read(&current->mm->mmap_sem);
+			balance_dirty_pages_ratelimited_nr(file->f_mapping,
+							nr_pages_dirtied);
+			fput(file);
+			down_read(&current->mm->mmap_sem);
+			vma = find_vma(current->mm, start);
+		} else {
+			vma = vma->vm_next;
+		}
+	} while (!done);
+out_unlock:
 	current->flags &= ~PF_SYNCWRITE;
+	up_read(&current->mm->mmap_sem);
+out:
 	return error;
 }