path: root/mm/mlock.c
author     Lee Schermerhorn <lee.schermerhorn@hp.com>  2008-10-18 23:26:49 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2008-10-20 11:52:31 -0400
commit     8edb08caf68184fb170f4f69c7445929e199eaea (patch)
tree       c0d8f24971c90e5627207f0f0cb7c06f9bdb5dc4 /mm/mlock.c
parent     fa07e787733416c42938a310a8e717295934e33c (diff)
mlock: downgrade mmap sem while populating mlocked regions
We need to hold the mmap_sem for write to initiate mlock()/munlock() because we may need to merge/split vmas. However, this can lead to very long lock hold times while we attempt to fault in a large memory region to mlock it into memory. This can hold off other faults against the mm [multithreaded tasks] and other scans of the mm, such as via /proc, and it is especially bad if we need to reclaim memory to lock down the region. To alleviate this, downgrade the mmap_sem to read mode during the population of the region for locking.

We [probably?] don't need to do this for unlocking, as all of the pages should be resident--they're already mlocked.

Now, the callers of the mlock functions [mlock_fixup() and mlock_vma_pages_range()] expect the mmap_sem to be returned in write mode. Changing all callers appears to be way too much effort at this point, so restore write mode before returning. Note that this opens a window where the mmap list could change in a multithreaded process. So, at least for mlock_fixup(), where we could be called in a loop over multiple vmas, we check that a vma still exists at the start address and that it still covers the page range [start,end). If not, we return -EAGAIN and let the caller deal with it.

Return -EAGAIN from mlock_vma_pages_range() and mlock_fixup() if the vma at 'start' disappears or changes so that the page range [start,end) is no longer contained in the vma. Again, let the caller deal with it. It looks like only sys_remap_file_pages() [via mmap_region()] should actually care.

With this patch, I no longer see processes like ps(1) blocked for seconds or minutes at a time waiting for a large [multiple gigabyte] region to be locked down. However, I occasionally see delays while unlocking or unmapping a large mlocked region. Should we also downgrade the mmap_sem for the unlock path?

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
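For orientation, here is a minimal sketch of the downgrade-and-revalidate pattern described above, written against the 2008-era mmap_sem API. populate_locked_range() is a hypothetical wrapper invented for illustration, and the call to __mlock_vma_pages_range() stands in for the population step that the static helper in mm/mlock.c actually performs; this is a sketch of the locking protocol, not the patch itself.

#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/sched.h>

/*
 * Hypothetical helper for illustration.  Entered with mm->mmap_sem held
 * for write (write mode is needed in case vmas had to be merged/split).
 * Returns the number of pages populated, or -EAGAIN if the vma changed
 * or disappeared while the semaphore was temporarily dropped.
 */
static long populate_locked_range(struct mm_struct *mm,
                                  struct vm_area_struct *vma,
                                  unsigned long start, unsigned long end)
{
        long nr_pages;

        /*
         * Fault the pages in under the read lock so that concurrent
         * faults and /proc scans of this mm are not held off for the
         * whole population (which may include reclaim).
         */
        downgrade_write(&mm->mmap_sem);
        nr_pages = __mlock_vma_pages_range(vma, start, end);

        /*
         * Callers expect mmap_sem to be returned in write mode, and
         * rw_semaphores have no atomic read->write upgrade.  Drop the
         * read lock, retake the write lock, and revalidate: the vma
         * may have changed or disappeared while the mm was unlocked.
         */
        up_read(&mm->mmap_sem);
        down_write(&mm->mmap_sem);

        vma = find_vma(mm, start);
        /* mirror the patch's check: bail out if [start,end) is no longer covered */
        if (!vma || end > vma->vm_end)
                return -EAGAIN;

        return nr_pages;
}

The -EAGAIN return follows the same convention as the patch: the caller (e.g. mmap_region() on behalf of sys_remap_file_pages()) is the one expected to deal with the region having changed underneath it.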
Diffstat (limited to 'mm/mlock.c')
-rw-r--r--  mm/mlock.c  46
1 file changed, 43 insertions(+), 3 deletions(-)
diff --git a/mm/mlock.c b/mm/mlock.c
index 8746fe3f9730..c83896a72504 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -318,6 +318,7 @@ static void __munlock_vma_pages_range(struct vm_area_struct *vma,
 int mlock_vma_pages_range(struct vm_area_struct *vma,
                         unsigned long start, unsigned long end)
 {
+        struct mm_struct *mm = vma->vm_mm;
         int nr_pages = (end - start) / PAGE_SIZE;
         BUG_ON(!(vma->vm_flags & VM_LOCKED));
 
@@ -329,8 +330,19 @@ int mlock_vma_pages_range(struct vm_area_struct *vma,
 
         if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
                         is_vm_hugetlb_page(vma) ||
-                        vma == get_gate_vma(current)))
-                return __mlock_vma_pages_range(vma, start, end);
+                        vma == get_gate_vma(current))) {
+                downgrade_write(&mm->mmap_sem);
+                nr_pages = __mlock_vma_pages_range(vma, start, end);
+
+                up_read(&mm->mmap_sem);
+                /* vma can change or disappear */
+                down_write(&mm->mmap_sem);
+                vma = find_vma(mm, start);
+                /* non-NULL vma must contain @start, but need to check @end */
+                if (!vma || end > vma->vm_end)
+                        return -EAGAIN;
+                return nr_pages;
+        }
 
         /*
          * User mapped kernel pages or huge pages:
@@ -424,13 +436,41 @@ success:
         vma->vm_flags = newflags;
 
         if (lock) {
+                /*
+                 * mmap_sem is currently held for write.  Downgrade the write
+                 * lock to a read lock so that other faults, mmap scans, ...
+                 * while we fault in all pages.
+                 */
+                downgrade_write(&mm->mmap_sem);
+
                 ret = __mlock_vma_pages_range(vma, start, end);
                 if (ret > 0) {
                         mm->locked_vm -= ret;
                         ret = 0;
                 }
-        } else
+                /*
+                 * Need to reacquire mmap sem in write mode, as our callers
+                 * expect this.  We have no support for atomically upgrading
+                 * a sem to write, so we need to check for ranges while sem
+                 * is unlocked.
+                 */
+                up_read(&mm->mmap_sem);
+                /* vma can change or disappear */
+                down_write(&mm->mmap_sem);
+                *prev = find_vma(mm, start);
+                /* non-NULL *prev must contain @start, but need to check @end */
+                if (!(*prev) || end > (*prev)->vm_end)
+                        ret = -EAGAIN;
+        } else {
+                /*
+                 * TODO:  for unlocking, pages will already be resident, so
+                 * we don't need to wait for allocations/reclaim/pagein, ...
+                 * However, unlocking a very large region can still take a
+                 * while.  Should we downgrade the semaphore for both lock
+                 * AND unlock ?
+                 */
                 __munlock_vma_pages_range(vma, start, end);
+        }
 
 out:
         *prev = vma;