aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2009-02-01 14:00:16 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2009-02-01 14:00:16 -0500
commit27421e211a39784694b597dbf35848b88363c248 (patch)
tree8d9a06e453c40dea7db3a3d0d9d3d864690427db
parent45c82b5a770be66845687a7d027c8b52946d59af (diff)
Manually revert "mlock: downgrade mmap sem while populating mlocked regions"
This essentially reverts commit 8edb08caf68184fb170f4f69c7445929e199eaea. It downgraded our mmap semaphore to a read-lock while mlocking pages, in order to allow other threads (and external accesses like "ps" et al) to walk the vma lists and take page faults etc. Which is a nice idea, but the implementation does not work. Because we cannot upgrade the lock back to a write lock without releasing the mmap semaphore, the code had to release the lock entirely and then re-take it as a writelock. However, that meant that the caller possibly lost the vma chain that it was following, since now another thread could come in and mmap/munmap the range. The code tried to work around that by just looking up the vma again and erroring out if that happened, but quite frankly, that was just a buggy hack that doesn't actually protect against anything (the other thread could just have replaced the vma with another one instead of totally unmapping it). The only way to downgrade to a read map _reliably_ is to do it at the end, which is likely the right thing to do: do all the 'vma' operations with the write-lock held, then downgrade to a read after completing them all, and then do the "populate the newly mlocked regions" while holding just the read lock. And then just drop the read-lock and return to user space. The (perhaps somewhat simpler) alternative is to just make all the callers of mlock_vma_pages_range() know that the mmap lock got dropped, and just re-grab the mmap semaphore if it needs to mlock more than one vma region. So we can do this "downgrade mmap sem while populating mlocked regions" thing right, but the way it was done here was absolutely not correct. Thus the revert, in the expectation that we will do it all correctly some day. Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Rik van Riel <riel@redhat.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: stable@kernel.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--mm/mlock.c47
1 files changed, 2 insertions, 45 deletions
diff --git a/mm/mlock.c b/mm/mlock.c
index 2904a347e476..028ec482fdd4 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -294,14 +294,10 @@ static inline int __mlock_posix_error_return(long retval)
294 * 294 *
295 * return number of pages [> 0] to be removed from locked_vm on success 295 * return number of pages [> 0] to be removed from locked_vm on success
296 * of "special" vmas. 296 * of "special" vmas.
297 *
298 * return negative error if vma spanning @start-@range disappears while
299 * mmap semaphore is dropped. Unlikely?
300 */ 297 */
301long mlock_vma_pages_range(struct vm_area_struct *vma, 298long mlock_vma_pages_range(struct vm_area_struct *vma,
302 unsigned long start, unsigned long end) 299 unsigned long start, unsigned long end)
303{ 300{
304 struct mm_struct *mm = vma->vm_mm;
305 int nr_pages = (end - start) / PAGE_SIZE; 301 int nr_pages = (end - start) / PAGE_SIZE;
306 BUG_ON(!(vma->vm_flags & VM_LOCKED)); 302 BUG_ON(!(vma->vm_flags & VM_LOCKED));
307 303
@@ -314,20 +310,8 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
314 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || 310 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
315 is_vm_hugetlb_page(vma) || 311 is_vm_hugetlb_page(vma) ||
316 vma == get_gate_vma(current))) { 312 vma == get_gate_vma(current))) {
317 long error;
318 downgrade_write(&mm->mmap_sem);
319
320 error = __mlock_vma_pages_range(vma, start, end, 1);
321 313
322 up_read(&mm->mmap_sem); 314 return __mlock_vma_pages_range(vma, start, end, 1);
323 /* vma can change or disappear */
324 down_write(&mm->mmap_sem);
325 vma = find_vma(mm, start);
326 /* non-NULL vma must contain @start, but need to check @end */
327 if (!vma || end > vma->vm_end)
328 return -ENOMEM;
329
330 return 0; /* hide other errors from mmap(), et al */
331 } 315 }
332 316
333 /* 317 /*
@@ -438,41 +422,14 @@ success:
438 vma->vm_flags = newflags; 422 vma->vm_flags = newflags;
439 423
440 if (lock) { 424 if (lock) {
441 /*
442 * mmap_sem is currently held for write. Downgrade the write
443 * lock to a read lock so that other faults, mmap scans, ...
444 * while we fault in all pages.
445 */
446 downgrade_write(&mm->mmap_sem);
447
448 ret = __mlock_vma_pages_range(vma, start, end, 1); 425 ret = __mlock_vma_pages_range(vma, start, end, 1);
449 426
450 /* 427 if (ret > 0) {
451 * Need to reacquire mmap sem in write mode, as our callers
452 * expect this. We have no support for atomically upgrading
453 * a sem to write, so we need to check for ranges while sem
454 * is unlocked.
455 */
456 up_read(&mm->mmap_sem);
457 /* vma can change or disappear */
458 down_write(&mm->mmap_sem);
459 *prev = find_vma(mm, start);
460 /* non-NULL *prev must contain @start, but need to check @end */
461 if (!(*prev) || end > (*prev)->vm_end)
462 ret = -ENOMEM;
463 else if (ret > 0) {
464 mm->locked_vm -= ret; 428 mm->locked_vm -= ret;
465 ret = 0; 429 ret = 0;
466 } else 430 } else
467 ret = __mlock_posix_error_return(ret); /* translate if needed */ 431 ret = __mlock_posix_error_return(ret); /* translate if needed */
468 } else { 432 } else {
469 /*
470 * TODO: for unlocking, pages will already be resident, so
471 * we don't need to wait for allocations/reclaim/pagein, ...
472 * However, unlocking a very large region can still take a
473 * while. Should we downgrade the semaphore for both lock
474 * AND unlock ?
475 */
476 __mlock_vma_pages_range(vma, start, end, 0); 433 __mlock_vma_pages_range(vma, start, end, 0);
477 } 434 }
478 435