diff options
author | Lee Schermerhorn <lee.schermerhorn@hp.com> | 2008-10-18 23:26:49 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-10-20 11:52:31 -0400 |
commit | 8edb08caf68184fb170f4f69c7445929e199eaea (patch) | |
tree | c0d8f24971c90e5627207f0f0cb7c06f9bdb5dc4 /mm/mlock.c | |
parent | fa07e787733416c42938a310a8e717295934e33c (diff) |
mlock: downgrade mmap sem while populating mlocked regions
We need to hold the mmap_sem for write to initiate mlock()/munlock()
because we may need to merge/split vmas. However, this can lead to very
long lock hold times attempting to fault in a large memory region to mlock
it into memory. This can hold off other faults against the mm
[multithreaded tasks] and other scans of the mm, such as via /proc. To
alleviate this, downgrade the mmap_sem to read mode during the population
of the region for locking. The long hold times are especially likely if we
need to reclaim memory to lock down the region. We [probably?] don't need to do
this for unlocking as all of the pages should be resident--they're already
mlocked.
Now, the callers of the mlock functions [mlock_fixup() and
mlock_vma_pages_range()] expect the mmap_sem to be returned in write mode.
Changing all callers appears to be way too much effort at this point.
So, restore write mode before returning. Note that this opens a window
where the mmap list could change in a multithreaded process. So, at least
for mlock_fixup(), where we could be called in a loop over multiple vmas,
we check that a vma still exists at the start address and that vma still
covers the page range [start,end). If not, we return an error, -EAGAIN,
and let the caller deal with it.
Return -EAGAIN from mlock_vma_pages_range() function and mlock_fixup() if
the vma at 'start' disappears or changes so that the page range
[start,end) is no longer contained in the vma. Again, let the caller deal
with it. Looks like only sys_remap_file_pages() [via mmap_region()]
should actually care.
With this patch, I no longer see processes like ps(1) blocked for seconds
or minutes at a time waiting for a large [multiple gigabyte] region to be
locked down. However, I occasionally see delays while unlocking or
unmapping a large mlocked region. Should we also downgrade the mmap_sem
for the unlock path?
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/mlock.c')
-rw-r--r-- | mm/mlock.c | 46 |
1 files changed, 43 insertions, 3 deletions
diff --git a/mm/mlock.c b/mm/mlock.c index 8746fe3f9730..c83896a72504 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -318,6 +318,7 @@ static void __munlock_vma_pages_range(struct vm_area_struct *vma, | |||
318 | int mlock_vma_pages_range(struct vm_area_struct *vma, | 318 | int mlock_vma_pages_range(struct vm_area_struct *vma, |
319 | unsigned long start, unsigned long end) | 319 | unsigned long start, unsigned long end) |
320 | { | 320 | { |
321 | struct mm_struct *mm = vma->vm_mm; | ||
321 | int nr_pages = (end - start) / PAGE_SIZE; | 322 | int nr_pages = (end - start) / PAGE_SIZE; |
322 | BUG_ON(!(vma->vm_flags & VM_LOCKED)); | 323 | BUG_ON(!(vma->vm_flags & VM_LOCKED)); |
323 | 324 | ||
@@ -329,8 +330,19 @@ int mlock_vma_pages_range(struct vm_area_struct *vma, | |||
329 | 330 | ||
330 | if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || | 331 | if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || |
331 | is_vm_hugetlb_page(vma) || | 332 | is_vm_hugetlb_page(vma) || |
332 | vma == get_gate_vma(current))) | 333 | vma == get_gate_vma(current))) { |
333 | return __mlock_vma_pages_range(vma, start, end); | 334 | downgrade_write(&mm->mmap_sem); |
335 | nr_pages = __mlock_vma_pages_range(vma, start, end); | ||
336 | |||
337 | up_read(&mm->mmap_sem); | ||
338 | /* vma can change or disappear */ | ||
339 | down_write(&mm->mmap_sem); | ||
340 | vma = find_vma(mm, start); | ||
341 | /* non-NULL vma must contain @start, but need to check @end */ | ||
342 | if (!vma || end > vma->vm_end) | ||
343 | return -EAGAIN; | ||
344 | return nr_pages; | ||
345 | } | ||
334 | 346 | ||
335 | /* | 347 | /* |
336 | * User mapped kernel pages or huge pages: | 348 | * User mapped kernel pages or huge pages: |
@@ -424,13 +436,41 @@ success: | |||
424 | vma->vm_flags = newflags; | 436 | vma->vm_flags = newflags; |
425 | 437 | ||
426 | if (lock) { | 438 | if (lock) { |
439 | /* | ||
440 | * mmap_sem is currently held for write. Downgrade the write | ||
441 | * lock to a read lock so that other faults, mmap scans, ... | ||
442 | * while we fault in all pages. | ||
443 | */ | ||
444 | downgrade_write(&mm->mmap_sem); | ||
445 | |||
427 | ret = __mlock_vma_pages_range(vma, start, end); | 446 | ret = __mlock_vma_pages_range(vma, start, end); |
428 | if (ret > 0) { | 447 | if (ret > 0) { |
429 | mm->locked_vm -= ret; | 448 | mm->locked_vm -= ret; |
430 | ret = 0; | 449 | ret = 0; |
431 | } | 450 | } |
432 | } else | 451 | /* |
452 | * Need to reacquire mmap sem in write mode, as our callers | ||
453 | * expect this. We have no support for atomically upgrading | ||
454 | * a sem to write, so we need to check for ranges while sem | ||
455 | * is unlocked. | ||
456 | */ | ||
457 | up_read(&mm->mmap_sem); | ||
458 | /* vma can change or disappear */ | ||
459 | down_write(&mm->mmap_sem); | ||
460 | *prev = find_vma(mm, start); | ||
461 | /* non-NULL *prev must contain @start, but need to check @end */ | ||
462 | if (!(*prev) || end > (*prev)->vm_end) | ||
463 | ret = -EAGAIN; | ||
464 | } else { | ||
465 | /* | ||
466 | * TODO: for unlocking, pages will already be resident, so | ||
467 | * we don't need to wait for allocations/reclaim/pagein, ... | ||
468 | * However, unlocking a very large region can still take a | ||
469 | * while. Should we downgrade the semaphore for both lock | ||
470 | * AND unlock ? | ||
471 | */ | ||
433 | __munlock_vma_pages_range(vma, start, end); | 472 | __munlock_vma_pages_range(vma, start, end); |
473 | } | ||
434 | 474 | ||
435 | out: | 475 | out: |
436 | *prev = vma; | 476 | *prev = vma; |