author     Nick Piggin <npiggin@suse.de>                    2008-10-18 23:26:44 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2008-10-20 11:52:30 -0400
commit     b291f000393f5a0b679012b39d79fbc85c018233 (patch)
tree       28eb785d4d157d3396e4377294e6054635a4bd90 /mm/memory.c
parent     89e004ea55abe201b29e2d6e35124101f1288ef7 (diff)
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.

This is achieved through various strategies:

1) add yet another page flag--PG_mlocked--to indicate that the page is
   locked for efficient testing in vmscan and, optionally, the fault path.
   This allows early culling of unevictable pages, preventing them from
   getting to page_referenced()/try_to_unmap().  Also allows separate
   accounting of mlock'd pages, as Nick's original patch did.

   Note: Nick's original mlock patch used a PG_mlocked flag.  I had
   removed this in favor of the PG_unevictable flag + an mlock_count
   [new page struct member].  I restored the PG_mlocked flag to
   eliminate the new count field.

2) add the mlock/unevictable infrastructure to mm/mlock.c, with internal
   APIs in mm/internal.h.  This is a rework of Nick's original patch to
   these files, taking into account that mlocked pages are now kept on
   the unevictable LRU list.

3) update vmscan.c:page_evictable() to check PageMlocked() and, if a vma
   is passed in, its vm_flags.  Note that the vma will only be passed in
   for new pages in the fault path, and then only if the "cull
   unevictable pages in fault path" patch is included.  (A sketch of
   this check follows the commit message below.)

4) add try_to_unlock() to rmap.c to walk a page's rmap and
   ClearPageMlocked() if no other vmas have it mlocked.  Reuses as much
   of try_to_unmap() as possible.  This effectively replaces the use of
   one of the lru list links as an mlock count.  If this mechanism lets
   pages in mlocked vmas leak through w/o PG_mlocked set [I don't know
   that it does], we should catch them later in try_to_unmap().  One
   hopes this will be rare, as it will be relatively expensive.

Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>

splitlru: introduce __get_user_pages():
   The new munlock processing needs GUP_FLAGS_IGNORE_VMA_PERMISSIONS,
   because the current get_user_pages() can't grab PROT_NONE pages and
   therefore PROT_NONE pages can't be munlocked.

[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and several comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
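To make point (3) above concrete, here is a minimal sketch of the evictability test it describes. This is an illustration only, not the code merged into mm/vmscan.c by this series; the name page_evictable_sketch() is made up for the example, and the real helper also handles cases (such as SHM_LOCK'd mappings) that are outside this commit.

#include <linux/mm.h>
#include <linux/page-flags.h>

/*
 * Sketch of the check from point (3): a page is unevictable if it is
 * already tagged PG_mlocked, or if the (optional) vma from the fault
 * path is VM_LOCKED.  Illustrative only, not the merged implementation.
 */
static int page_evictable_sketch(struct page *page, struct vm_area_struct *vma)
{
        if (PageMlocked(page))
                return 0;       /* already culled to the unevictable LRU */

        if (vma && (vma->vm_flags & VM_LOCKED))
                return 0;       /* new page faulting into an mlocked vma */

        return 1;               /* evictable: normal LRU handling */
}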
Diffstat (limited to 'mm/memory.c')
-rw-r--r--   mm/memory.c   56
1 file changed, 49 insertions(+), 7 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 71cdefd1ef1..9fef7272fb9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -64,6 +64,8 @@
 
 #include "internal.h"
 
+#include "internal.h"
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
@@ -1129,12 +1131,17 @@ static inline int use_zero_page(struct vm_area_struct *vma)
         return !vma->vm_ops || !vma->vm_ops->fault;
 }
 
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-                unsigned long start, int len, int write, int force,
+
+
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+                unsigned long start, int len, int flags,
                 struct page **pages, struct vm_area_struct **vmas)
 {
         int i;
-        unsigned int vm_flags;
+        unsigned int vm_flags = 0;
+        int write = !!(flags & GUP_FLAGS_WRITE);
+        int force = !!(flags & GUP_FLAGS_FORCE);
+        int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
 
         if (len <= 0)
                 return 0;
@@ -1158,7 +1165,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                         pud_t *pud;
                         pmd_t *pmd;
                         pte_t *pte;
-                        if (write) /* user gate pages are read-only */
+
+                        /* user gate pages are read-only */
+                        if (!ignore && write)
                                 return i ? : -EFAULT;
                         if (pg > TASK_SIZE)
                                 pgd = pgd_offset_k(pg);
@@ -1190,8 +1199,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                         continue;
                 }
 
-                if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
-                                || !(vm_flags & vma->vm_flags))
+                if (!vma ||
+                    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+                    (!ignore && !(vm_flags & vma->vm_flags)))
                         return i ? : -EFAULT;
 
                 if (is_vm_hugetlb_page(vma)) {
@@ -1266,6 +1276,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
         } while (len);
         return i;
 }
+
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+                unsigned long start, int len, int write, int force,
+                struct page **pages, struct vm_area_struct **vmas)
+{
+        int flags = 0;
+
+        if (write)
+                flags |= GUP_FLAGS_WRITE;
+        if (force)
+                flags |= GUP_FLAGS_FORCE;
+
+        return __get_user_pages(tsk, mm,
+                                start, len, flags,
+                                pages, vmas);
+}
+
 EXPORT_SYMBOL(get_user_pages);
 
 pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
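The wrapper above preserves the old get_user_pages() interface while letting internal callers pass flags directly. As a hedged illustration of why the extra flag exists, a munlock-style walker might use __get_user_pages() roughly as below; the GUP_FLAGS_* values and the loop are assumptions for the example (the real caller and the flag definitions live in mm/mlock.c and mm/internal.h, outside this diff).

/* Assumed flag values; the real definitions are in mm/internal.h. */
#define GUP_FLAGS_WRITE                         0x1
#define GUP_FLAGS_FORCE                         0x2
#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS        0x4

/*
 * Sketch of a munlock-style walker.  Ignoring vma permissions lets the
 * lookup succeed even for PROT_NONE mappings, which plain
 * get_user_pages() would refuse.
 */
static void munlock_range_sketch(struct vm_area_struct *vma,
                                 unsigned long start, unsigned long end)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long addr;

        for (addr = start; addr < end; addr += PAGE_SIZE) {
                struct page *page;

                if (__get_user_pages(current, mm, addr, 1,
                                     GUP_FLAGS_IGNORE_VMA_PERMISSIONS,
                                     &page, NULL) <= 0)
                        continue;       /* page not present; nothing to unlock */

                /* the real code would clear PG_mlocked here (munlock_vma_page()) */
                put_page(page);         /* drop the reference taken by GUP */
        }
}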
@@ -1858,6 +1885,15 @@ gotten:
         new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
         if (!new_page)
                 goto oom;
+        /*
+         * Don't let another task, with possibly unlocked vma,
+         * keep the mlocked page.
+         */
+        if (vma->vm_flags & VM_LOCKED) {
+                lock_page(old_page);    /* for LRU manipulation */
+                clear_page_mlock(old_page);
+                unlock_page(old_page);
+        }
         cow_user_page(new_page, old_page, address, vma);
         __SetPageUptodate(new_page);
 
@@ -2325,7 +2361,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
         page_add_anon_rmap(page, vma, address);
 
         swap_free(entry);
-        if (vm_swap_full())
+        if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
                 remove_exclusive_swap_page(page);
         unlock_page(page);
 
@@ -2465,6 +2501,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                 ret = VM_FAULT_OOM;
                                 goto out;
                         }
+                        /*
+                         * Don't let another task, with possibly unlocked vma,
+                         * keep the mlocked page.
+                         */
+                        if (vma->vm_flags & VM_LOCKED)
+                                clear_page_mlock(vmf.page);
                         copy_user_highpage(page, vmf.page, address, vma);
                         __SetPageUptodate(page);
                 } else {
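Both clear_page_mlock() call sites in the hunks above (the COW path and __do_fault()) rely on a helper added elsewhere in this series (mm/internal.h and mm/mlock.c). A simplified, hedged sketch of what that helper is expected to do, so the two hunks read on their own; this is not the exact merged code.

/*
 * Simplified sketch of clear_page_mlock().  The idea: if the page was
 * tagged PG_mlocked, drop the tag and move the page back onto an
 * evictable LRU list so vmscan can handle it normally again.
 */
static void clear_page_mlock_sketch(struct page *page)
{
        if (!TestClearPageMlocked(page))
                return;                         /* page was never mlock-tagged */

        if (!isolate_lru_page(page))            /* pull it off its current LRU... */
                putback_lru_page(page);         /* ...and re-add it, now evictable */
}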