author     Hugh Dickins <hugh.dickins@tiscali.co.uk>        2009-09-21 20:03:23 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2009-09-22 10:17:40 -0400
commit     408e82b78bcc9f1b47c76e833c3df97f675947de
tree       393fb1c659f3fe23928c722d3cffcac3429c1aee   /mm/mlock.c
parent     5d3bc2709114b416cab588c577e02c2470e40a6c
mm: munlock use follow_page
Hiroaki Wakabayashi points out that when mlock() has been interrupted by SIGKILL, the subsequent munlock() takes unnecessarily long because its use of __get_user_pages() insists on faulting in all the pages which mlock() never reached.

It's worse than slowness if mlock() is terminated by Out Of Memory kill: the munlock_vma_pages_all() in exit_mmap() insists on faulting in all the pages which mlock() could not find memory for; so innocent bystanders are killed too, and perhaps the system hangs.

__get_user_pages() does a lot that's silly for munlock(): so remove the munlock option from __mlock_vma_pages_range(), and use a simple loop of follow_page()s in munlock_vma_pages_range() instead; ignoring absent pages, and not marking present pages as accessed or dirty.

(Change munlock() to only go so far as mlock() reached? That does not work out, given the convention that mlock() claims complete success even when it has to give up early - in part so that an underlying file can be extended later, and those pages locked which earlier would give SIGBUS.)

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: <stable@kernel.org>
Acked-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Hiroaki Wakabayashi <primulaelatior@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
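For quick reference, here is a lightly annotated copy of the munlock loop this patch introduces, taken from the diff below; the comments are editorial additions summarizing the commit message, not part of the patch itself.

/*
 * New munlock path (sketch, from the diff below): walk the range with
 * follow_page() instead of __get_user_pages(), so absent pages are
 * simply skipped and nothing is faulted in.
 */
void munlock_vma_pages_range(struct vm_area_struct *vma,
                             unsigned long start, unsigned long end)
{
        unsigned long addr;

        lru_add_drain();
        vma->vm_flags &= ~VM_LOCKED;

        for (addr = start; addr < end; addr += PAGE_SIZE) {
                /*
                 * follow_page() only reports pages that are already
                 * present; FOLL_GET takes a reference without marking
                 * the page accessed or dirty.
                 */
                struct page *page = follow_page(vma, addr, FOLL_GET);
                if (page) {
                        lock_page(page);
                        /* page->mapping check skips truncated file pages */
                        if (page->mapping)
                                munlock_vma_page(page);
                        unlock_page(page);
                        put_page(page);
                }
                cond_resched();
        }
}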
Diffstat (limited to 'mm/mlock.c')
-rw-r--r--   mm/mlock.c   99
1 file changed, 40 insertions(+), 59 deletions(-)
diff --git a/mm/mlock.c b/mm/mlock.c
index 45eb650b9654..e13918d4fc4f 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -139,49 +139,36 @@ static void munlock_vma_page(struct page *page)
 }
 
 /**
- * __mlock_vma_pages_range() -  mlock/munlock a range of pages in the vma.
+ * __mlock_vma_pages_range() -  mlock a range of pages in the vma.
  * @vma:   target vma
  * @start: start address
  * @end:   end address
- * @mlock: 0 indicate munlock, otherwise mlock.
  *
- * If @mlock == 0, unlock an mlocked range;
- * else mlock the range of pages.  This takes care of making the pages present ,
- * too.
+ * This takes care of making the pages present too.
  *
  * return 0 on success, negative error code on error.
  *
  * vma->vm_mm->mmap_sem must be held for at least read.
  */
 static long __mlock_vma_pages_range(struct vm_area_struct *vma,
-                                    unsigned long start, unsigned long end,
-                                    int mlock)
+                                    unsigned long start, unsigned long end)
 {
         struct mm_struct *mm = vma->vm_mm;
         unsigned long addr = start;
         struct page *pages[16];         /* 16 gives a reasonable batch */
         int nr_pages = (end - start) / PAGE_SIZE;
         int ret = 0;
-        int gup_flags = 0;
+        int gup_flags;
 
         VM_BUG_ON(start & ~PAGE_MASK);
         VM_BUG_ON(end   & ~PAGE_MASK);
         VM_BUG_ON(start < vma->vm_start);
         VM_BUG_ON(end   > vma->vm_end);
-        VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
-                  (atomic_read(&mm->mm_users) != 0));
-
-        /*
-         * mlock:   don't page populate if vma has PROT_NONE permission.
-         * munlock: always do munlock although the vma has PROT_NONE
-         *          permission, or SIGKILL is pending.
-         */
-        if (!mlock)
-                gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
-                             GUP_FLAGS_IGNORE_SIGKILL;
+        VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
 
+        gup_flags = 0;
         if (vma->vm_flags & VM_WRITE)
-                gup_flags |= GUP_FLAGS_WRITE;
+                gup_flags = GUP_FLAGS_WRITE;
 
         while (nr_pages > 0) {
                 int i;
@@ -201,19 +188,10 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
                  * This can happen for, e.g., VM_NONLINEAR regions before
                  * a page has been allocated and mapped at a given offset,
                  * or for addresses that map beyond end of a file.
-                 * We'll mlock the the pages if/when they get faulted in.
+                 * We'll mlock the pages if/when they get faulted in.
                  */
                 if (ret < 0)
                         break;
-                if (ret == 0) {
-                        /*
-                         * We know the vma is there, so the only time
-                         * we cannot get a single page should be an
-                         * error (ret < 0) case.
-                         */
-                        WARN_ON(1);
-                        break;
-                }
 
                 lru_add_drain();        /* push cached pages to LRU */
 
@@ -224,28 +202,22 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
                         /*
                          * Because we lock page here and migration is blocked
                          * by the elevated reference, we need only check for
-                         * page truncation (file-cache only).
+                         * file-cache page truncation.  This page->mapping
+                         * check also neatly skips over the ZERO_PAGE(),
+                         * though if that's common we'd prefer not to lock it.
                          */
-                        if (page->mapping) {
-                                if (mlock)
-                                        mlock_vma_page(page);
-                                else
-                                        munlock_vma_page(page);
-                        }
+                        if (page->mapping)
+                                mlock_vma_page(page);
                         unlock_page(page);
                         put_page(page); /* ref from get_user_pages() */
-
-                        /*
-                         * here we assume that get_user_pages() has given us
-                         * a list of virtually contiguous pages.
-                         */
-                        addr += PAGE_SIZE;      /* for next get_user_pages() */
-                        nr_pages--;
                 }
+
+                addr += ret * PAGE_SIZE;
+                nr_pages -= ret;
                 ret = 0;
         }
 
-        return ret;     /* count entire vma as locked_vm */
+        return ret;     /* 0 or negative error code */
 }
 
 /*
@@ -289,7 +261,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
                         is_vm_hugetlb_page(vma) ||
                         vma == get_gate_vma(current))) {
 
-                __mlock_vma_pages_range(vma, start, end, 1);
+                __mlock_vma_pages_range(vma, start, end);
 
                 /* Hide errors from mmap() and other callers */
                 return 0;
@@ -310,7 +282,6 @@ no_mlock:
         return nr_pages;                /* error or pages NOT mlocked */
 }
 
-
 /*
  * munlock_vma_pages_range() -  munlock all pages in the vma range.'
  * @vma - vma containing range to be munlock()ed.
@@ -330,10 +301,24 @@ no_mlock:
  * free them.  This will result in freeing mlocked pages.
  */
 void munlock_vma_pages_range(struct vm_area_struct *vma,
                              unsigned long start, unsigned long end)
 {
+        unsigned long addr;
+
+        lru_add_drain();
         vma->vm_flags &= ~VM_LOCKED;
-        __mlock_vma_pages_range(vma, start, end, 0);
+
+        for (addr = start; addr < end; addr += PAGE_SIZE) {
+                struct page *page = follow_page(vma, addr, FOLL_GET);
+                if (page) {
+                        lock_page(page);
+                        if (page->mapping)
+                                munlock_vma_page(page);
+                        unlock_page(page);
+                        put_page(page);
+                }
+                cond_resched();
+        }
 }
 
 /*
@@ -400,18 +385,14 @@ success:
          * It's okay if try_to_unmap_one unmaps a page just after we
          * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
          */
-        vma->vm_flags = newflags;
 
         if (lock) {
-                ret = __mlock_vma_pages_range(vma, start, end, 1);
-
-                if (ret > 0) {
-                        mm->locked_vm -= ret;
-                        ret = 0;
-                } else
-                        ret = __mlock_posix_error_return(ret);  /* translate if needed */
+                vma->vm_flags = newflags;
+                ret = __mlock_vma_pages_range(vma, start, end);
+                if (ret < 0)
+                        ret = __mlock_posix_error_return(ret);
         } else {
-                __mlock_vma_pages_range(vma, start, end, 0);
+                munlock_vma_pages_range(vma, start, end);
         }
 
 out: