Manually revert "mlock: downgrade mmap sem while populating mlocked regions"

This essentially reverts commit 8edb08caf68184fb170f4f69c7445929e199eaea. It downgraded our mmap semaphore to a read-lock while mlocking pages, in order to allow other threads (and external accesses like "ps" et al) to walk the vma lists and take page faults etc. Which is a nice idea, but the implementation does not work. Because we cannot upgrade the lock back to a write lock without releasing the mmap semaphore, the code had to release the lock entirely and then re-take it as a writelock. However, that meant that the caller possibly lost the vma chain that it was following, since now another thread could come in and mmap/munmap the range. The code tried to work around that by just looking up the vma again and erroring out if that happened, but quite frankly, that was just a buggy hack that doesn't actually protect against anything (the other thread could just have replaced the vma with another one instead of totally unmapping it). The only way to downgrade to a read map _reliably_ is to do it at the end, which is likely the right thing to do: do all the 'vma' operations with the write-lock held, then downgrade to a read after completing them all, and then do the "populate the newly mlocked regions" while holding just the read lock. And then just drop the read-lock and return to user space. The (perhaps somewhat simpler) alternative is to just make all the callers of mlock_vma_pages_range() know that the mmap lock got dropped, and just re-grab the mmap semaphore if it needs to mlock more than one vma region. So we can do this "downgrade mmap sem while populating mlocked regions" thing right, but the way it was done here was absolutely not correct. Thus the revert, in the expectation that we will do it all correctly some day. Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Rik van Riel <riel@redhat.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: stable@kernel.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Linus Torvalds <torvalds@linux-foundation.org> 2009-02-01 14:00:16 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2009-02-01 14:00:16 -0500
commit: 27421e211a39784694b597dbf35848b88363c248 (patch)
tree: 8d9a06e453c40dea7db3a3d0d9d3d864690427db
parent: 45c82b5a770be66845687a7d027c8b52946d59af (diff)
1 files changed, 2 insertions, 45 deletions
diff --git a/mm/mlock.c b/mm/mlock.c
index 2904a347e476..028ec482fdd4 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -294,14 +294,10 @@ static inline int __mlock_posix_error_return(long retval)
 *
 * return number of pages [> 0] to be removed from locked_vm on success
 * of "special" vmas.
- *
- * return negative error if vma spanning @start-@range disappears while
- * mmap semaphore is dropped.  Unlikely?
 */
 long mlock_vma_pages_range(struct vm_area_struct *vma,
                        unsigned long start, unsigned long end)
 {
-        struct mm_struct *mm = vma->vm_mm;
        int nr_pages = (end - start) / PAGE_SIZE;
        BUG_ON(!(vma->vm_flags & VM_LOCKED));
@@ -314,20 +310,8 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
        if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
                        is_vm_hugetlb_page(vma) ||
                        vma == get_gate_vma(current))) {
-                long error;
-                downgrade_write(&mm->mmap_sem);
-                error = __mlock_vma_pages_range(vma, start, end, 1);
-                up_read(&mm->mmap_sem);
+                return __mlock_vma_pages_range(vma, start, end, 1);
-                /* vma can change or disappear */
-                down_write(&mm->mmap_sem);
-                vma = find_vma(mm, start);
-                /* non-NULL vma must contain @start, but need to check @end */
-                if (!vma ||  end > vma->vm_end)
-                        return -ENOMEM;
-                return 0;       /* hide other errors from mmap(), et al */
        }
        /*
@@ -438,41 +422,14 @@ success:
        vma->vm_flags = newflags;
        if (lock) {
-                /*
-                 * mmap_sem is currently held for write.  Downgrade the write
-                 * lock to a read lock so that other faults, mmap scans, ...
-                 * while we fault in all pages.
-                 */
-                downgrade_write(&mm->mmap_sem);
                ret = __mlock_vma_pages_range(vma, start, end, 1);
-                /*
+                if (ret > 0) {
-                 * Need to reacquire mmap sem in write mode, as our callers
-                 * expect this.  We have no support for atomically upgrading
-                 * a sem to write, so we need to check for ranges while sem
-                 * is unlocked.
-                 */
-                up_read(&mm->mmap_sem);
-                /* vma can change or disappear */
-                down_write(&mm->mmap_sem);
-                *prev = find_vma(mm, start);
-                /* non-NULL *prev must contain @start, but need to check @end */
-                if (!(*prev) || end > (*prev)->vm_end)
-                        ret = -ENOMEM;
-                else if (ret > 0) {
                        mm->locked_vm -= ret;
                        ret = 0;
                } else
                        ret = __mlock_posix_error_return(ret); /* translate if needed */
        } else {
-                /*
-                 * TODO:  for unlocking, pages will already be resident, so
-                 * we don't need to wait for allocations/reclaim/pagein, ...
-                 * However, unlocking a very large region can still take a
-                 * while.  Should we downgrade the semaphore for both lock
-                 * AND unlock ?
-                 */
                __mlock_vma_pages_range(vma, start, end, 0);
        }
author	Linus Torvalds <torvalds@linux-foundation.org>	2009-02-01 14:00:16 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-02-01 14:00:16 -0500
commit	27421e211a39784694b597dbf35848b88363c248 (patch)
tree	8d9a06e453c40dea7db3a3d0d9d3d864690427db
parent	45c82b5a770be66845687a7d027c8b52946d59af (diff)

diff --git a/mm/mlock.c b/mm/mlock.c index 2904a347e476..028ec482fdd4 100644 --- a/mm/mlock.c +++ b/mm/mlock.c
@@ -294,14 +294,10 @@ static inline int __mlock_posix_error_return(long retval)
294	*	294	*
295	* return number of pages [> 0] to be removed from locked_vm on success	295	* return number of pages [> 0] to be removed from locked_vm on success
296	* of "special" vmas.	296	* of "special" vmas.
297	*
298	* return negative error if vma spanning @start-@range disappears while
299	* mmap semaphore is dropped. Unlikely?
300	*/	297	*/
301	long mlock_vma_pages_range(struct vm_area_struct *vma,	298	long mlock_vma_pages_range(struct vm_area_struct *vma,
302	unsigned long start, unsigned long end)	299	unsigned long start, unsigned long end)
303	{	300	{
304	struct mm_struct *mm = vma->vm_mm;
305	int nr_pages = (end - start) / PAGE_SIZE;	301	int nr_pages = (end - start) / PAGE_SIZE;
306	BUG_ON(!(vma->vm_flags & VM_LOCKED));	302	BUG_ON(!(vma->vm_flags & VM_LOCKED));
307		303
@@ -314,20 +310,8 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
314	if (!((vma->vm_flags & (VM_DONTEXPAND \| VM_RESERVED)) \|\|	310	if (!((vma->vm_flags & (VM_DONTEXPAND \| VM_RESERVED)) \|\|
315	is_vm_hugetlb_page(vma) \|\|	311	is_vm_hugetlb_page(vma) \|\|
316	vma == get_gate_vma(current))) {	312	vma == get_gate_vma(current))) {
317	long error;
318	downgrade_write(&mm->mmap_sem);
319
320	error = __mlock_vma_pages_range(vma, start, end, 1);
321		313
322	up_read(&mm->mmap_sem);	314	return __mlock_vma_pages_range(vma, start, end, 1);
323	/* vma can change or disappear */
324	down_write(&mm->mmap_sem);
325	vma = find_vma(mm, start);
326	/* non-NULL vma must contain @start, but need to check @end */
327	if (!vma \|\| end > vma->vm_end)
328	return -ENOMEM;
329
330	return 0; /* hide other errors from mmap(), et al */
331	}	315	}
332		316
333	/*	317	/*
@@ -438,41 +422,14 @@ success:
438	vma->vm_flags = newflags;	422	vma->vm_flags = newflags;
439		423
440	if (lock) {	424	if (lock) {
441	/*
442	* mmap_sem is currently held for write. Downgrade the write
443	* lock to a read lock so that other faults, mmap scans, ...
444	* while we fault in all pages.
445	*/
446	downgrade_write(&mm->mmap_sem);
447
448	ret = __mlock_vma_pages_range(vma, start, end, 1);	425	ret = __mlock_vma_pages_range(vma, start, end, 1);
449		426
450	/*	427	if (ret > 0) {
451	* Need to reacquire mmap sem in write mode, as our callers
452	* expect this. We have no support for atomically upgrading
453	* a sem to write, so we need to check for ranges while sem
454	* is unlocked.
455	*/
456	up_read(&mm->mmap_sem);
457	/* vma can change or disappear */
458	down_write(&mm->mmap_sem);
459	*prev = find_vma(mm, start);
460	/* non-NULL prev must contain @start, but need to check @end /
461	if (!(prev) \|\| end > (prev)->vm_end)
462	ret = -ENOMEM;
463	else if (ret > 0) {
464	mm->locked_vm -= ret;	428	mm->locked_vm -= ret;
465	ret = 0;	429	ret = 0;
466	} else	430	} else
467	ret = __mlock_posix_error_return(ret); /* translate if needed */	431	ret = __mlock_posix_error_return(ret); /* translate if needed */
468	} else {	432	} else {
469	/*
470	* TODO: for unlocking, pages will already be resident, so
471	* we don't need to wait for allocations/reclaim/pagein, ...
472	* However, unlocking a very large region can still take a
473	* while. Should we downgrade the semaphore for both lock
474	* AND unlock ?
475	*/
476	__mlock_vma_pages_range(vma, start, end, 0);	433	__mlock_vma_pages_range(vma, start, end, 0);
477	}	434	}
478		435