aboutsummaryrefslogtreecommitdiffstats
path: root/mm/fremap.c
diff options
context:
space:
mode:
authorMichel Lespinasse <walken@google.com>2013-02-22 19:32:36 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2013-02-23 20:50:10 -0500
commit940e7da5163029978c2f6b5bbe213607add59062 (patch)
tree1a7a107fa1ca7d753da08f0e0e059a22ddcb44e8 /mm/fremap.c
parentdafcb73e385e39b9a7ebd5c4ecbc4ae921862eb9 (diff)
mm: remap_file_pages() fixes
We have many vma manipulation functions that are fast in the typical case, but can optionally be instructed to populate an unbounded number of ptes within the region they work on: - mmap with MAP_POPULATE or MAP_LOCKED flags; - remap_file_pages() with MAP_NONBLOCK not set or when working on a VM_LOCKED vma; - mmap_region() and all its wrappers when mlock(MCL_FUTURE) is in effect; - brk() when mlock(MCL_FUTURE) is in effect. Current code handles these pte operations locally, while the surrounding code has to hold the mmap_sem write side since it's manipulating vmas. This means we're doing an unbounded amount of pte population work with mmap_sem held, and this causes problems as Andy Lutomirski reported (we've hit this at Google as well, though it's not entirely clear why people keep trying to use mlock(MCL_FUTURE) in the first place). I propose introducing a new mm_populate() function to do this pte population work after the mmap_sem has been released. mm_populate() does need to acquire the mmap_sem read side, but critically, it doesn't need to hold it continuously for the entire duration of the operation - it can drop it whenever things take too long (such as when hitting disk for a file read) and re-acquire it later on. The following patches are included - Patch 1 fixes some issues I noticed while working on the existing code. If needed, it could potentially go in before the rest of the patches. - Patch 2 introduces the new mm_populate() function and changes mmap_region() call sites to use it after they drop mmap_sem. This is inspired by Andy Lutomirski's proposal and is built as an extension of the work I had previously done for mlock() and mlockall() around v2.6.38-rc1. I had tried doing something similar at the time but had given up as there were so many do_mmap() call sites; the recent cleanups by Linus and Viro are a tremendous help here. 
- Patches 3-5 convert some of the less-obvious places doing unbounded pte populates to the new mm_populate() mechanism. - Patches 6-7 are code cleanups that are made possible by the mm_populate() work. In particular, they remove more code than the entire patch series added, which should be a good thing :) - Patch 8 is optional to this entire series. It only helps to deal more nicely with racy userspace programs that might modify their mappings while we're trying to populate them. It adds a new VM_POPULATE flag on the mappings we do want to populate, so that if userspace replaces them with mappings it doesn't want populated, mm_populate() won't populate those replacement mappings. This patch: Assorted small fixes. The first two are quite small: - Move check for vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR) within existing if (!(vma->vm_flags & VM_NONLINEAR)) block. Purely cosmetic. - In the VM_LOCKED case, when dropping PG_Mlocked for the over-mapped range, make sure we own the mmap_sem write lock around the munlock_vma_pages_range call as this manipulates the vma's vm_flags. The last fix requires a longer explanation. remap_file_pages() can do its work either through VM_NONLINEAR manipulation or by creating extra vmas. These two cases were inconsistent with each other (and ultimately, both wrong) as to exactly when they faulted in the newly mapped file pages: - In the VM_NONLINEAR case, new file pages would be populated if the MAP_NONBLOCK flag wasn't passed. If MAP_NONBLOCK was passed, new file pages wouldn't be populated even if the vma is already marked as VM_LOCKED. - In the linear (emulated) case, the work is passed to the mmap_region() function which would populate the pages if the vma is marked as VM_LOCKED, and would not otherwise - regardless of the value of the MAP_NONBLOCK flag, because MAP_POPULATE wasn't being passed to mmap_region(). 
The desired behavior is that we want the pages to be populated and locked if the vma is marked as VM_LOCKED, or to be populated if the MAP_NONBLOCK flag is not passed to remap_file_pages(). Signed-off-by: Michel Lespinasse <walken@google.com> Acked-by: Rik van Riel <riel@redhat.com> Tested-by: Andy Lutomirski <luto@amacapital.net> Cc: Greg Ungerer <gregungerer@westnet.com.au> Cc: David Howells <dhowells@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/fremap.c')
-rw-r--r--mm/fremap.c22
1 files changed, 14 insertions, 8 deletions
diff --git a/mm/fremap.c b/mm/fremap.c
index a0aaf0e56800..2db886e31044 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -160,15 +160,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
160 /* 160 /*
161 * Make sure the vma is shared, that it supports prefaulting, 161 * Make sure the vma is shared, that it supports prefaulting,
162 * and that the remapped range is valid and fully within 162 * and that the remapped range is valid and fully within
163 * the single existing vma. vm_private_data is used as a 163 * the single existing vma.
164 * swapout cursor in a VM_NONLINEAR vma.
165 */ 164 */
166 if (!vma || !(vma->vm_flags & VM_SHARED)) 165 if (!vma || !(vma->vm_flags & VM_SHARED))
167 goto out; 166 goto out;
168 167
169 if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
170 goto out;
171
172 if (!vma->vm_ops || !vma->vm_ops->remap_pages) 168 if (!vma->vm_ops || !vma->vm_ops->remap_pages)
173 goto out; 169 goto out;
174 170
@@ -177,6 +173,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
177 173
178 /* Must set VM_NONLINEAR before any pages are populated. */ 174 /* Must set VM_NONLINEAR before any pages are populated. */
179 if (!(vma->vm_flags & VM_NONLINEAR)) { 175 if (!(vma->vm_flags & VM_NONLINEAR)) {
176 /*
177 * vm_private_data is used as a swapout cursor
178 * in a VM_NONLINEAR vma.
179 */
180 if (vma->vm_private_data)
181 goto out;
182
180 /* Don't need a nonlinear mapping, exit success */ 183 /* Don't need a nonlinear mapping, exit success */
181 if (pgoff == linear_page_index(vma, start)) { 184 if (pgoff == linear_page_index(vma, start)) {
182 err = 0; 185 err = 0;
@@ -184,6 +187,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
184 } 187 }
185 188
186 if (!has_write_lock) { 189 if (!has_write_lock) {
190get_write_lock:
187 up_read(&mm->mmap_sem); 191 up_read(&mm->mmap_sem);
188 down_write(&mm->mmap_sem); 192 down_write(&mm->mmap_sem);
189 has_write_lock = 1; 193 has_write_lock = 1;
@@ -199,7 +203,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
199 unsigned long addr; 203 unsigned long addr;
200 struct file *file = get_file(vma->vm_file); 204 struct file *file = get_file(vma->vm_file);
201 205
202 flags &= MAP_NONBLOCK; 206 flags = (flags & MAP_NONBLOCK) | MAP_POPULATE;
203 addr = mmap_region(file, start, size, 207 addr = mmap_region(file, start, size,
204 flags, vma->vm_flags, pgoff); 208 flags, vma->vm_flags, pgoff);
205 fput(file); 209 fput(file);
@@ -225,6 +229,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
225 * drop PG_Mlocked flag for over-mapped range 229 * drop PG_Mlocked flag for over-mapped range
226 */ 230 */
227 vm_flags_t saved_flags = vma->vm_flags; 231 vm_flags_t saved_flags = vma->vm_flags;
232 if (!has_write_lock)
233 goto get_write_lock;
228 munlock_vma_pages_range(vma, start, start + size); 234 munlock_vma_pages_range(vma, start, start + size);
229 vma->vm_flags = saved_flags; 235 vma->vm_flags = saved_flags;
230 } 236 }
@@ -232,13 +238,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
232 mmu_notifier_invalidate_range_start(mm, start, start + size); 238 mmu_notifier_invalidate_range_start(mm, start, start + size);
233 err = vma->vm_ops->remap_pages(vma, start, size, pgoff); 239 err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
234 mmu_notifier_invalidate_range_end(mm, start, start + size); 240 mmu_notifier_invalidate_range_end(mm, start, start + size);
235 if (!err && !(flags & MAP_NONBLOCK)) { 241 if (!err) {
236 if (vma->vm_flags & VM_LOCKED) { 242 if (vma->vm_flags & VM_LOCKED) {
237 /* 243 /*
238 * might be mapping previously unmapped range of file 244 * might be mapping previously unmapped range of file
239 */ 245 */
240 mlock_vma_pages_range(vma, start, start + size); 246 mlock_vma_pages_range(vma, start, start + size);
241 } else { 247 } else if (!(flags & MAP_NONBLOCK)) {
242 if (unlikely(has_write_lock)) { 248 if (unlikely(has_write_lock)) {
243 downgrade_write(&mm->mmap_sem); 249 downgrade_write(&mm->mmap_sem);
244 has_write_lock = 0; 250 has_write_lock = 0;