aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLee Schermerhorn <lee.schermerhorn@hp.com>2008-10-18 23:26:49 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-10-20 11:52:31 -0400
commit8edb08caf68184fb170f4f69c7445929e199eaea (patch)
treec0d8f24971c90e5627207f0f0cb7c06f9bdb5dc4
parentfa07e787733416c42938a310a8e717295934e33c (diff)
mlock: downgrade mmap sem while populating mlocked regions
We need to hold the mmap_sem for write to initiate mlock()/munlock() because we may need to merge/split vmas. However, this can lead to very long lock hold times attempting to fault in a large memory region to mlock it into memory. This can hold off other faults against the mm [multithreaded tasks] and other scans of the mm, such as via /proc. To alleviate this, downgrade the mmap_sem to read mode during the population of the region for locking. This is especially the case if we need to reclaim memory to lock down the region. We [probably?] don't need to do this for unlocking as all of the pages should be resident--they're already mlocked. Now, the callers of the mlock functions [mlock_fixup() and mlock_vma_pages_range()] expect the mmap_sem to be returned in write mode. Changing all callers appears to be way too much effort at this point. So, restore write mode before returning. Note that this opens a window where the mmap list could change in a multithreaded process. So, at least for mlock_fixup(), where we could be called in a loop over multiple vmas, we check that a vma still exists at the start address and that vma still covers the page range [start,end). If not, we return an error, -EAGAIN, and let the caller deal with it. Return -EAGAIN from mlock_vma_pages_range() function and mlock_fixup() if the vma at 'start' disappears or changes so that the page range [start,end) is no longer contained in the vma. Again, let the caller deal with it. Looks like only sys_remap_file_pages() [via mmap_region()] should actually care. With this patch, I no longer see processes like ps(1) blocked for seconds or minutes at a time waiting for a large [multiple gigabyte] region to be locked down. However, I occasionally see delays while unlocking or unmapping a large mlocked region. Should we also downgrade the mmap_sem for the unlock path? 
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--mm/mlock.c46
1 files changed, 43 insertions, 3 deletions
diff --git a/mm/mlock.c b/mm/mlock.c
index 8746fe3f9730..c83896a72504 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -318,6 +318,7 @@ static void __munlock_vma_pages_range(struct vm_area_struct *vma,
318int mlock_vma_pages_range(struct vm_area_struct *vma, 318int mlock_vma_pages_range(struct vm_area_struct *vma,
319 unsigned long start, unsigned long end) 319 unsigned long start, unsigned long end)
320{ 320{
321 struct mm_struct *mm = vma->vm_mm;
321 int nr_pages = (end - start) / PAGE_SIZE; 322 int nr_pages = (end - start) / PAGE_SIZE;
322 BUG_ON(!(vma->vm_flags & VM_LOCKED)); 323 BUG_ON(!(vma->vm_flags & VM_LOCKED));
323 324
@@ -329,8 +330,19 @@ int mlock_vma_pages_range(struct vm_area_struct *vma,
329 330
330 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || 331 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
331 is_vm_hugetlb_page(vma) || 332 is_vm_hugetlb_page(vma) ||
332 vma == get_gate_vma(current))) 333 vma == get_gate_vma(current))) {
333 return __mlock_vma_pages_range(vma, start, end); 334 downgrade_write(&mm->mmap_sem);
335 nr_pages = __mlock_vma_pages_range(vma, start, end);
336
337 up_read(&mm->mmap_sem);
338 /* vma can change or disappear */
339 down_write(&mm->mmap_sem);
340 vma = find_vma(mm, start);
341 /* non-NULL vma must contain @start, but need to check @end */
342 if (!vma || end > vma->vm_end)
343 return -EAGAIN;
344 return nr_pages;
345 }
334 346
335 /* 347 /*
336 * User mapped kernel pages or huge pages: 348 * User mapped kernel pages or huge pages:
@@ -424,13 +436,41 @@ success:
424 vma->vm_flags = newflags; 436 vma->vm_flags = newflags;
425 437
426 if (lock) { 438 if (lock) {
439 /*
440 * mmap_sem is currently held for write. Downgrade the write
441 * lock to a read lock so that other faults, mmap scans, ...
442 * while we fault in all pages.
443 */
444 downgrade_write(&mm->mmap_sem);
445
427 ret = __mlock_vma_pages_range(vma, start, end); 446 ret = __mlock_vma_pages_range(vma, start, end);
428 if (ret > 0) { 447 if (ret > 0) {
429 mm->locked_vm -= ret; 448 mm->locked_vm -= ret;
430 ret = 0; 449 ret = 0;
431 } 450 }
432 } else 451 /*
452 * Need to reacquire mmap sem in write mode, as our callers
453 * expect this. We have no support for atomically upgrading
454 * a sem to write, so we need to check for ranges while sem
455 * is unlocked.
456 */
457 up_read(&mm->mmap_sem);
458 /* vma can change or disappear */
459 down_write(&mm->mmap_sem);
460 *prev = find_vma(mm, start);
461 /* non-NULL *prev must contain @start, but need to check @end */
462 if (!(*prev) || end > (*prev)->vm_end)
463 ret = -EAGAIN;
464 } else {
465 /*
466 * TODO: for unlocking, pages will already be resident, so
467 * we don't need to wait for allocations/reclaim/pagein, ...
468 * However, unlocking a very large region can still take a
469 * while. Should we downgrade the semaphore for both lock
470 * AND unlock ?
471 */
433 __munlock_vma_pages_range(vma, start, end); 472 __munlock_vma_pages_range(vma, start, end);
473 }
434 474
435out: 475out:
436 *prev = vma; 476 *prev = vma;