author     Michel Lespinasse <walken@google.com>           2011-01-13 18:46:10 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2011-01-13 20:32:35 -0500
commit     fed067da46ad3b9acedaf794a5f05d0bc153280b
tree       ba2eadd8c3c2884d15a5a9373924b33b40bdca7e /mm/mlock.c
parent     5ecfda041e4b4bd858d25bbf5a16c2a6c06d7272
mlock: only hold mmap_sem in shared mode when faulting in pages
Currently mlock() holds mmap_sem in exclusive mode while the pages get
faulted in.  In the case of a large mlock, this can potentially take a
very long time, during which various commands such as 'ps auxw' will
block.  This makes sysadmins unhappy:

real    14m36.232s
user    0m0.003s
sys     0m0.015s
(output from 'time ps auxw' while a 20GB file was being mlocked without
being previously preloaded into page cache)

I propose that mlock() could release mmap_sem after the VM_LOCKED bits
have been set in all appropriate VMAs.  Then a second pass could be done
to actually mlock the pages, in small batches, releasing mmap_sem when
we block on disk access or when we detect some contention.

This patch:

Before this change, mlock() holds mmap_sem in exclusive mode while the
pages get faulted in.  In the case of a large mlock, this can potentially
take a very long time.  Various things will block while mmap_sem is held,
including 'ps auxw'.  This can make sysadmins angry.

I propose that mlock() could release mmap_sem after the VM_LOCKED bits
have been set in all appropriate VMAs.  Then a second pass could be done
to actually mlock the pages with mmap_sem held for reads only.  We need
to recheck the vma flags after we re-acquire mmap_sem, but this is easy.

In the case where a vma has been munlocked before mlock completes, pages
that were already marked as PageMlocked() are handled by the munlock()
call, and mlock() is careful to not mark new page batches as
PageMlocked() after the munlock() call has cleared the VM_LOCKED vma
flags.  So, the end result will be identical to what'd happen if
munlock() had executed after the mlock() call.

In a later change, I will allow the second pass to release mmap_sem when
blocking on disk accesses or when it is otherwise contended, so that it
won't be held for long periods of time even in shared mode.

Signed-off-by: Michel Lespinasse <walken@google.com>
Tested-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
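For illustration only (not part of this patch), a minimal userspace sketch of the scenario above: map a large file and mlock() it, then run 'time ps auxw' from another shell while the pages fault in. The file path is a placeholder, and locking that much memory assumes CAP_IPC_LOCK or a raised RLIMIT_MEMLOCK.

/* mlock-bigfile.c: reproducer sketch for the mmap_sem hold described above */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/data/bigfile";	/* placeholder: large file not in page cache */
	struct stat st;
	int fd = open(path, O_RDONLY);

	if (fd < 0 || fstat(fd, &st) < 0) {
		perror("open/fstat");
		return EXIT_FAILURE;
	}

	void *p = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}

	/*
	 * Faults in every page: this is the long phase that used to run
	 * with mmap_sem held for write and, after this patch, runs with
	 * it held for read.
	 */
	if (mlock(p, st.st_size) < 0)
		perror("mlock");

	pause();	/* keep the mapping locked until the process is killed */
	return EXIT_SUCCESS;
}

Before this patch, the 'ps auxw' reader blocks on mmap_sem for the whole fault-in; with it, only the short VM_LOCKED flag-setting phase holds the semaphore exclusively.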
Diffstat (limited to 'mm/mlock.c')
-rw-r--r--  mm/mlock.c  81
1 file changed, 64 insertions(+), 17 deletions(-)
diff --git a/mm/mlock.c b/mm/mlock.c
index 4f318642fbb..67b3dd8616d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -377,18 +377,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	int ret = 0;
 	int lock = newflags & VM_LOCKED;
 
-	if (newflags == vma->vm_flags ||
-			(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+	if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
+	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))
 		goto out;	/* don't set VM_LOCKED, don't count */
 
-	if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
-			is_vm_hugetlb_page(vma) ||
-			vma == get_gate_vma(current)) {
-		if (lock)
-			make_pages_present(start, end);
-		goto out;	/* don't set VM_LOCKED, don't count */
-	}
-
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
 			  vma->vm_file, pgoff, vma_policy(vma));
@@ -424,14 +416,10 @@ success:
 	 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
 	 */
 
-	if (lock) {
+	if (lock)
 		vma->vm_flags = newflags;
-		ret = __mlock_vma_pages_range(vma, start, end);
-		if (ret < 0)
-			ret = __mlock_posix_error_return(ret);
-	} else {
+	else
 		munlock_vma_pages_range(vma, start, end);
-	}
 
 out:
 	*prev = vma;
@@ -444,7 +432,8 @@ static int do_mlock(unsigned long start, size_t len, int on)
 	struct vm_area_struct * vma, * prev;
 	int error;
 
-	len = PAGE_ALIGN(len);
+	VM_BUG_ON(start & ~PAGE_MASK);
+	VM_BUG_ON(len != PAGE_ALIGN(len));
 	end = start + len;
 	if (end < start)
 		return -EINVAL;
@@ -487,6 +476,58 @@ static int do_mlock(unsigned long start, size_t len, int on)
 	return error;
 }
 
+static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long end, nstart, nend;
+	struct vm_area_struct *vma = NULL;
+	int ret = 0;
+
+	VM_BUG_ON(start & ~PAGE_MASK);
+	VM_BUG_ON(len != PAGE_ALIGN(len));
+	end = start + len;
+
+	down_read(&mm->mmap_sem);
+	for (nstart = start; nstart < end; nstart = nend) {
+		/*
+		 * We want to fault in pages for [nstart; end) address range.
+		 * Find first corresponding VMA.
+		 */
+		if (!vma)
+			vma = find_vma(mm, nstart);
+		else
+			vma = vma->vm_next;
+		if (!vma || vma->vm_start >= end)
+			break;
+		/*
+		 * Set [nstart; nend) to intersection of desired address
+		 * range with the first VMA. Also, skip undesirable VMA types.
+		 */
+		nend = min(end, vma->vm_end);
+		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+			continue;
+		if (nstart < vma->vm_start)
+			nstart = vma->vm_start;
+		/*
+		 * Now fault in a range of pages within the first VMA.
+		 */
+		if (vma->vm_flags & VM_LOCKED) {
+			ret = __mlock_vma_pages_range(vma, nstart, nend);
+			if (ret < 0 && ignore_errors) {
+				ret = 0;
+				continue;	/* continue at next VMA */
+			}
+			if (ret) {
+				ret = __mlock_posix_error_return(ret);
+				break;
+			}
+		} else
+			make_pages_present(nstart, nend);
+	}
+	up_read(&mm->mmap_sem);
+	return ret;	/* 0 or negative error code */
+}
+
 SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
 {
 	unsigned long locked;
@@ -512,6 +553,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
 	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
 		error = do_mlock(start, len, 1);
 	up_write(&current->mm->mmap_sem);
+	if (!error)
+		error = do_mlock_pages(start, len, 0);
 	return error;
 }
 
@@ -576,6 +619,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
 	    capable(CAP_IPC_LOCK))
 		ret = do_mlockall(flags);
 	up_write(&current->mm->mmap_sem);
+	if (!ret && (flags & MCL_CURRENT)) {
+		/* Ignore errors */
+		do_mlock_pages(0, TASK_SIZE, 1);
+	}
 out:
 	return ret;
 }
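For illustration only (not part of this patch), a minimal mlockall() caller that exercises the new second pass added above. The fault-in still completes before mlockall() returns, but it runs with mmap_sem held for read, and its errors are ignored for MCL_CURRENT, matching the do_mlock_pages(0, TASK_SIZE, 1) call.

/* mlockall-current.c: usage sketch for the mlockall() path above */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	if (mlockall(MCL_CURRENT) != 0) {
		perror("mlockall");
		return 1;
	}
	/*
	 * All currently mapped pages have been faulted in (best effort)
	 * and locked; future mappings are unaffected without MCL_FUTURE.
	 */
	return 0;
}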