author     Nick Piggin <npiggin@suse.de>                    2008-10-18 23:26:44 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2008-10-20 11:52:30 -0400
commit     b291f000393f5a0b679012b39d79fbc85c018233
tree       28eb785d4d157d3396e4377294e6054635a4bd90
parent     89e004ea55abe201b29e2d6e35124101f1288ef7
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.

This is achieved through various strategies:

1) add yet another page flag--PG_mlocked--to indicate that the page is
   locked for efficient testing in vmscan and, optionally, the fault path.
   This allows early culling of unevictable pages, preventing them from
   getting to page_referenced()/try_to_unmap().  Also allows separate
   accounting of mlock'd pages, as Nick's original patch did.

   Note:  Nick's original mlock patch used a PG_mlocked flag.  I had
   removed this in favor of the PG_unevictable flag + an mlock_count
   [new page struct member].  I restored the PG_mlocked flag to eliminate
   the new count field.

2) add the mlock/unevictable infrastructure to mm/mlock.c, with internal
   APIs in mm/internal.h.  This is a rework of Nick's original patch to
   these files, taking into account that mlocked pages are now kept on
   the unevictable LRU list.

3) update vmscan.c:page_evictable() to check PageMlocked() and, if a vma
   is passed in, the vm_flags.  Note that the vma will only be passed in
   for new pages in the fault path, and then only if the "cull unevictable
   pages in fault path" patch is included.

4) add try_to_munlock() to rmap.c to walk a page's rmap and
   ClearPageMlocked() if no other vmas have it mlocked.  Reuses as much of
   try_to_unmap() as possible.  This effectively replaces the use of one
   of the lru list links as an mlock count.  If this mechanism lets pages
   in mlocked vmas leak through w/o PG_mlocked set [I don't know that it
   does], we should catch them later in try_to_unmap().  One hopes this
   will be rare, as it will be relatively expensive.

Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>

splitlru: introduce __get_user_pages():

The new munlock processing needs GUP_FLAGS_IGNORE_VMA_PERMISSIONS,
because the current get_user_pages() can't grab PROT_NONE pages, so
PROT_NONE pages can't be munlocked.

[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and several comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
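
Editor's sketch (not part of the patch): the loop below shows how a munlock-side
walker could use the new __get_user_pages() entry point so that pages in
PROT_NONE (but VM_LOCKED) vmas, which plain get_user_pages() refuses, are still
returned and can have PG_mlocked cleared.  Only the GUP_FLAGS_* values, the
__get_user_pages() prototype and munlock_vma_page() come from this patch; the
walker function itself is hypothetical and assumed to live in mm/mlock.c next
to the existing helpers.

/*
 * Hypothetical munlock walker -- illustrative only.
 * GUP_FLAGS_IGNORE_VMA_PERMISSIONS makes __get_user_pages() hand back pages
 * even when the vma's protection bits would normally forbid the access.
 */
static void munlock_range_sketch(struct vm_area_struct *vma,
				 unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr = start;
	struct page *pages[16];
	int nr_pages = (end - start) / PAGE_SIZE;

	while (nr_pages > 0) {
		int i, ret;

		ret = __get_user_pages(current, mm, addr,
				       min_t(int, nr_pages, ARRAY_SIZE(pages)),
				       GUP_FLAGS_IGNORE_VMA_PERMISSIONS,
				       pages, NULL);
		if (ret <= 0)
			break;		/* hole or error: nothing to munlock here */

		for (i = 0; i < ret; i++) {
			lock_page(pages[i]);
			munlock_vma_page(pages[i]);	/* clears PG_mlocked, rechecks rmap */
			unlock_page(pages[i]);
			put_page(pages[i]);		/* drop the gup reference */
			addr += PAGE_SIZE;
			nr_pages--;
		}
	}
}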
-rw-r--r--  include/linux/mm.h          |   5
-rw-r--r--  include/linux/page-flags.h  |  19
-rw-r--r--  include/linux/rmap.h        |  14
-rw-r--r--  mm/internal.h               |  71
-rw-r--r--  mm/memory.c                 |  56
-rw-r--r--  mm/migrate.c                |   2
-rw-r--r--  mm/mlock.c                  | 394
-rw-r--r--  mm/mmap.c                   |   2
-rw-r--r--  mm/nommu.c                  |  44
-rw-r--r--  mm/page_alloc.c             |   6
-rw-r--r--  mm/rmap.c                   | 257
-rw-r--r--  mm/swap.c                   |   2
-rw-r--r--  mm/vmscan.c                 |  36
13 files changed, 817 insertions, 91 deletions
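
Editor's note on the PG_mlocked accessors used throughout the diff: with
CONFIG_UNEVICTABLE_LRU enabled, the PAGEFLAG(Mlocked, mlocked)
__CLEARPAGEFLAG(Mlocked, mlocked) TESTSCFLAG(Mlocked, mlocked) lines added in
the include/linux/page-flags.h hunk below expand to roughly the following
inline helpers.  This is a hand-expanded sketch for orientation, not code from
the patch:

static inline int PageMlocked(struct page *page)
{
	return test_bit(PG_mlocked, &page->flags);
}
static inline void SetPageMlocked(struct page *page)
{
	set_bit(PG_mlocked, &page->flags);
}
static inline void ClearPageMlocked(struct page *page)
{
	clear_bit(PG_mlocked, &page->flags);
}
static inline void __ClearPageMlocked(struct page *page)
{
	__clear_bit(PG_mlocked, &page->flags);
}
static inline int TestSetPageMlocked(struct page *page)
{
	return test_and_set_bit(PG_mlocked, &page->flags);
}
static inline int TestClearPageMlocked(struct page *page)
{
	return test_and_clear_bit(PG_mlocked, &page->flags);
}

Without CONFIG_UNEVICTABLE_LRU, the PAGEFLAG_FALSE/SETPAGEFLAG_NOOP/
TESTCLEARFLAG_FALSE variants make the same names compile to constant 0 or
no-ops, and MLOCK_PAGES is 0 so callers can compile the mlock handling away.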
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 40236290e2ae..ffee2f743418 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -132,6 +132,11 @@ extern unsigned int kobjsize(const void *objp);
132#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) 132#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ)
133 133
134/* 134/*
135 * special vmas that are non-mergable, non-mlock()able
136 */
137#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
138
139/*
135 * mapping from the currently active vm_flags protection bits (the 140 * mapping from the currently active vm_flags protection bits (the
136 * low four bits) to a page protection mask.. 141 * low four bits) to a page protection mask..
137 */ 142 */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ec1a1baad348..b12f93a3c345 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -96,6 +96,7 @@ enum pageflags {
96 PG_swapbacked, /* Page is backed by RAM/swap */ 96 PG_swapbacked, /* Page is backed by RAM/swap */
97#ifdef CONFIG_UNEVICTABLE_LRU 97#ifdef CONFIG_UNEVICTABLE_LRU
98 PG_unevictable, /* Page is "unevictable" */ 98 PG_unevictable, /* Page is "unevictable" */
99 PG_mlocked, /* Page is vma mlocked */
99#endif 100#endif
100#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR 101#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
101 PG_uncached, /* Page has been mapped as uncached */ 102 PG_uncached, /* Page has been mapped as uncached */
@@ -232,7 +233,17 @@ PAGEFLAG_FALSE(SwapCache)
232#ifdef CONFIG_UNEVICTABLE_LRU 233#ifdef CONFIG_UNEVICTABLE_LRU
233PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) 234PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
234 TESTCLEARFLAG(Unevictable, unevictable) 235 TESTCLEARFLAG(Unevictable, unevictable)
236
237#define MLOCK_PAGES 1
238PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
239 TESTSCFLAG(Mlocked, mlocked)
240
235#else 241#else
242
243#define MLOCK_PAGES 0
244PAGEFLAG_FALSE(Mlocked)
245 SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked)
246
236PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable) 247PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable)
237 SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable) 248 SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable)
238 __CLEARPAGEFLAG_NOOP(Unevictable) 249 __CLEARPAGEFLAG_NOOP(Unevictable)
@@ -354,15 +365,17 @@ static inline void __ClearPageTail(struct page *page)
354#endif /* !PAGEFLAGS_EXTENDED */ 365#endif /* !PAGEFLAGS_EXTENDED */
355 366
356#ifdef CONFIG_UNEVICTABLE_LRU 367#ifdef CONFIG_UNEVICTABLE_LRU
357#define __PG_UNEVICTABLE (1 << PG_unevictable) 368#define __PG_UNEVICTABLE (1 << PG_unevictable)
369#define __PG_MLOCKED (1 << PG_mlocked)
358#else 370#else
359#define __PG_UNEVICTABLE 0 371#define __PG_UNEVICTABLE 0
372#define __PG_MLOCKED 0
360#endif 373#endif
361 374
362#define PAGE_FLAGS (1 << PG_lru | 1 << PG_private | 1 << PG_locked | \ 375#define PAGE_FLAGS (1 << PG_lru | 1 << PG_private | 1 << PG_locked | \
363 1 << PG_buddy | 1 << PG_writeback | \ 376 1 << PG_buddy | 1 << PG_writeback | \
364 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ 377 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \
365 __PG_UNEVICTABLE) 378 __PG_UNEVICTABLE | __PG_MLOCKED)
366 379
367/* 380/*
368 * Flags checked in bad_page(). Pages on the free list should not have 381 * Flags checked in bad_page(). Pages on the free list should not have
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index fed6f5e0b411..955667e6a52d 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -117,6 +117,19 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
117 */ 117 */
118int page_mkclean(struct page *); 118int page_mkclean(struct page *);
119 119
120#ifdef CONFIG_UNEVICTABLE_LRU
121/*
122 * called in munlock()/munmap() path to check for other vmas holding
123 * the page mlocked.
124 */
125int try_to_munlock(struct page *);
126#else
127static inline int try_to_munlock(struct page *page)
128{
129 return 0; /* a.k.a. SWAP_SUCCESS */
130}
131#endif
132
120#else /* !CONFIG_MMU */ 133#else /* !CONFIG_MMU */
121 134
122#define anon_vma_init() do {} while (0) 135#define anon_vma_init() do {} while (0)
@@ -140,5 +153,6 @@ static inline int page_mkclean(struct page *page)
140#define SWAP_SUCCESS 0 153#define SWAP_SUCCESS 0
141#define SWAP_AGAIN 1 154#define SWAP_AGAIN 1
142#define SWAP_FAIL 2 155#define SWAP_FAIL 2
156#define SWAP_MLOCK 3
143 157
144#endif /* _LINUX_RMAP_H */ 158#endif /* _LINUX_RMAP_H */
diff --git a/mm/internal.h b/mm/internal.h
index 3db17b2a1ac6..4ebf0bef9a39 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -61,6 +61,10 @@ static inline unsigned long page_order(struct page *page)
61 return page_private(page); 61 return page_private(page);
62} 62}
63 63
64extern int mlock_vma_pages_range(struct vm_area_struct *vma,
65 unsigned long start, unsigned long end);
66extern void munlock_vma_pages_all(struct vm_area_struct *vma);
67
64#ifdef CONFIG_UNEVICTABLE_LRU 68#ifdef CONFIG_UNEVICTABLE_LRU
65/* 69/*
66 * unevictable_migrate_page() called only from migrate_page_copy() to 70 * unevictable_migrate_page() called only from migrate_page_copy() to
@@ -79,6 +83,65 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
79} 83}
80#endif 84#endif
81 85
86#ifdef CONFIG_UNEVICTABLE_LRU
87/*
88 * Called only in fault path via page_evictable() for a new page
89 * to determine if it's being mapped into a LOCKED vma.
90 * If so, mark page as mlocked.
91 */
92static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
93{
94 VM_BUG_ON(PageLRU(page));
95
96 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
97 return 0;
98
99 SetPageMlocked(page);
100 return 1;
101}
102
103/*
104 * must be called with vma's mmap_sem held for read, and page locked.
105 */
106extern void mlock_vma_page(struct page *page);
107
108/*
109 * Clear the page's PageMlocked(). This can be useful in a situation where
110 * we want to unconditionally remove a page from the pagecache -- e.g.,
111 * on truncation or freeing.
112 *
113 * It is legal to call this function for any page, mlocked or not.
114 * If called for a page that is still mapped by mlocked vmas, all we do
115 * is revert to lazy LRU behaviour -- semantics are not broken.
116 */
117extern void __clear_page_mlock(struct page *page);
118static inline void clear_page_mlock(struct page *page)
119{
120 if (unlikely(TestClearPageMlocked(page)))
121 __clear_page_mlock(page);
122}
123
124/*
125 * mlock_migrate_page - called only from migrate_page_copy() to
126 * migrate the Mlocked page flag
127 */
128static inline void mlock_migrate_page(struct page *newpage, struct page *page)
129{
130 if (TestClearPageMlocked(page))
131 SetPageMlocked(newpage);
132}
133
134
135#else /* CONFIG_UNEVICTABLE_LRU */
136static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
137{
138 return 0;
139}
140static inline void clear_page_mlock(struct page *page) { }
141static inline void mlock_vma_page(struct page *page) { }
142static inline void mlock_migrate_page(struct page *new, struct page *old) { }
143
144#endif /* CONFIG_UNEVICTABLE_LRU */
82 145
83/* 146/*
84 * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, 147 * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
@@ -148,4 +211,12 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
148} 211}
149#endif /* CONFIG_SPARSEMEM */ 212#endif /* CONFIG_SPARSEMEM */
150 213
214#define GUP_FLAGS_WRITE 0x1
215#define GUP_FLAGS_FORCE 0x2
216#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
217
218int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
219 unsigned long start, int len, int flags,
220 struct page **pages, struct vm_area_struct **vmas);
221
151#endif 222#endif
diff --git a/mm/memory.c b/mm/memory.c
index 71cdefd1ef14..9fef7272fb9e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -64,6 +64,8 @@
64 64
65#include "internal.h" 65#include "internal.h"
66 66
67#include "internal.h"
68
67#ifndef CONFIG_NEED_MULTIPLE_NODES 69#ifndef CONFIG_NEED_MULTIPLE_NODES
68/* use the per-pgdat data instead for discontigmem - mbligh */ 70/* use the per-pgdat data instead for discontigmem - mbligh */
69unsigned long max_mapnr; 71unsigned long max_mapnr;
@@ -1129,12 +1131,17 @@ static inline int use_zero_page(struct vm_area_struct *vma)
1129 return !vma->vm_ops || !vma->vm_ops->fault; 1131 return !vma->vm_ops || !vma->vm_ops->fault;
1130} 1132}
1131 1133
1132int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1134
1133 unsigned long start, int len, int write, int force, 1135
1136int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1137 unsigned long start, int len, int flags,
1134 struct page **pages, struct vm_area_struct **vmas) 1138 struct page **pages, struct vm_area_struct **vmas)
1135{ 1139{
1136 int i; 1140 int i;
1137 unsigned int vm_flags; 1141 unsigned int vm_flags = 0;
1142 int write = !!(flags & GUP_FLAGS_WRITE);
1143 int force = !!(flags & GUP_FLAGS_FORCE);
1144 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1138 1145
1139 if (len <= 0) 1146 if (len <= 0)
1140 return 0; 1147 return 0;
@@ -1158,7 +1165,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1158 pud_t *pud; 1165 pud_t *pud;
1159 pmd_t *pmd; 1166 pmd_t *pmd;
1160 pte_t *pte; 1167 pte_t *pte;
1161 if (write) /* user gate pages are read-only */ 1168
1169 /* user gate pages are read-only */
1170 if (!ignore && write)
1162 return i ? : -EFAULT; 1171 return i ? : -EFAULT;
1163 if (pg > TASK_SIZE) 1172 if (pg > TASK_SIZE)
1164 pgd = pgd_offset_k(pg); 1173 pgd = pgd_offset_k(pg);
@@ -1190,8 +1199,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1190 continue; 1199 continue;
1191 } 1200 }
1192 1201
1193 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) 1202 if (!vma ||
1194 || !(vm_flags & vma->vm_flags)) 1203 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1204 (!ignore && !(vm_flags & vma->vm_flags)))
1195 return i ? : -EFAULT; 1205 return i ? : -EFAULT;
1196 1206
1197 if (is_vm_hugetlb_page(vma)) { 1207 if (is_vm_hugetlb_page(vma)) {
@@ -1266,6 +1276,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1266 } while (len); 1276 } while (len);
1267 return i; 1277 return i;
1268} 1278}
1279
1280int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1281 unsigned long start, int len, int write, int force,
1282 struct page **pages, struct vm_area_struct **vmas)
1283{
1284 int flags = 0;
1285
1286 if (write)
1287 flags |= GUP_FLAGS_WRITE;
1288 if (force)
1289 flags |= GUP_FLAGS_FORCE;
1290
1291 return __get_user_pages(tsk, mm,
1292 start, len, flags,
1293 pages, vmas);
1294}
1295
1269EXPORT_SYMBOL(get_user_pages); 1296EXPORT_SYMBOL(get_user_pages);
1270 1297
1271pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1298pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
@@ -1858,6 +1885,15 @@ gotten:
1858 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1885 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1859 if (!new_page) 1886 if (!new_page)
1860 goto oom; 1887 goto oom;
1888 /*
1889 * Don't let another task, with possibly unlocked vma,
1890 * keep the mlocked page.
1891 */
1892 if (vma->vm_flags & VM_LOCKED) {
1893 lock_page(old_page); /* for LRU manipulation */
1894 clear_page_mlock(old_page);
1895 unlock_page(old_page);
1896 }
1861 cow_user_page(new_page, old_page, address, vma); 1897 cow_user_page(new_page, old_page, address, vma);
1862 __SetPageUptodate(new_page); 1898 __SetPageUptodate(new_page);
1863 1899
@@ -2325,7 +2361,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2325 page_add_anon_rmap(page, vma, address); 2361 page_add_anon_rmap(page, vma, address);
2326 2362
2327 swap_free(entry); 2363 swap_free(entry);
2328 if (vm_swap_full()) 2364 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2329 remove_exclusive_swap_page(page); 2365 remove_exclusive_swap_page(page);
2330 unlock_page(page); 2366 unlock_page(page);
2331 2367
@@ -2465,6 +2501,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2465 ret = VM_FAULT_OOM; 2501 ret = VM_FAULT_OOM;
2466 goto out; 2502 goto out;
2467 } 2503 }
2504 /*
2505 * Don't let another task, with possibly unlocked vma,
2506 * keep the mlocked page.
2507 */
2508 if (vma->vm_flags & VM_LOCKED)
2509 clear_page_mlock(vmf.page);
2468 copy_user_highpage(page, vmf.page, address, vma); 2510 copy_user_highpage(page, vmf.page, address, vma);
2469 __SetPageUptodate(page); 2511 __SetPageUptodate(page);
2470 } else { 2512 } else {
diff --git a/mm/migrate.c b/mm/migrate.c
index b10237d8b459..6802a7a3dfec 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -371,6 +371,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
371 __set_page_dirty_nobuffers(newpage); 371 __set_page_dirty_nobuffers(newpage);
372 } 372 }
373 373
374 mlock_migrate_page(newpage, page);
375
374#ifdef CONFIG_SWAP 376#ifdef CONFIG_SWAP
375 ClearPageSwapCache(page); 377 ClearPageSwapCache(page);
376#endif 378#endif
diff --git a/mm/mlock.c b/mm/mlock.c
index 01fbe93eff5c..8746fe3f9730 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -8,10 +8,18 @@
8#include <linux/capability.h> 8#include <linux/capability.h>
9#include <linux/mman.h> 9#include <linux/mman.h>
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/swap.h>
12#include <linux/swapops.h>
13#include <linux/pagemap.h>
11#include <linux/mempolicy.h> 14#include <linux/mempolicy.h>
12#include <linux/syscalls.h> 15#include <linux/syscalls.h>
13#include <linux/sched.h> 16#include <linux/sched.h>
14#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rmap.h>
19#include <linux/mmzone.h>
20#include <linux/hugetlb.h>
21
22#include "internal.h"
15 23
16int can_do_mlock(void) 24int can_do_mlock(void)
17{ 25{
@@ -23,17 +31,360 @@ int can_do_mlock(void)
23} 31}
24EXPORT_SYMBOL(can_do_mlock); 32EXPORT_SYMBOL(can_do_mlock);
25 33
34#ifdef CONFIG_UNEVICTABLE_LRU
35/*
36 * Mlocked pages are marked with PageMlocked() flag for efficient testing
37 * in vmscan and, possibly, the fault path; and to support semi-accurate
38 * statistics.
39 *
40 * An mlocked page [PageMlocked(page)] is unevictable. As such, it will
41 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
42 * The unevictable list is an LRU sibling list to the [in]active lists.
43 * PageUnevictable is set to indicate the unevictable state.
44 *
45 * When lazy mlocking via vmscan, it is important to ensure that the
46 * vma's VM_LOCKED status is not concurrently being modified, otherwise we
47 * may have mlocked a page that is being munlocked. So lazy mlock must take
48 * the mmap_sem for read, and verify that the vma really is locked
49 * (see mm/rmap.c).
50 */
51
52/*
53 * LRU accounting for clear_page_mlock()
54 */
55void __clear_page_mlock(struct page *page)
56{
57 VM_BUG_ON(!PageLocked(page));
58
59 if (!page->mapping) { /* truncated ? */
60 return;
61 }
62
63 if (!isolate_lru_page(page)) {
64 putback_lru_page(page);
65 } else {
66 /*
67 * Page not on the LRU yet. Flush all pagevecs and retry.
68 */
69 lru_add_drain_all();
70 if (!isolate_lru_page(page))
71 putback_lru_page(page);
72 }
73}
74
75/*
76 * Mark page as mlocked if not already.
77 * If page on LRU, isolate and putback to move to unevictable list.
78 */
79void mlock_vma_page(struct page *page)
80{
81 BUG_ON(!PageLocked(page));
82
83 if (!TestSetPageMlocked(page) && !isolate_lru_page(page))
84 putback_lru_page(page);
85}
86
87/*
88 * called from munlock()/munmap() path with page supposedly on the LRU.
89 *
90 * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked
91 * [in try_to_munlock()] and then attempt to isolate the page. We must
92 * isolate the page to keep others from messing with its unevictable
93 * and mlocked state while trying to munlock. However, we pre-clear the
94 * mlocked state anyway as we might lose the isolation race and we might
95 * not get another chance to clear PageMlocked. If we successfully
96 * isolate the page and try_to_munlock() detects other VM_LOCKED vmas
97 * mapping the page, it will restore the PageMlocked state, unless the page
98 * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(),
99 * perhaps redundantly.
100 * If we lose the isolation race, and the page is mapped by other VM_LOCKED
101 * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
102 * either of which will restore the PageMlocked state by calling
103 * mlock_vma_page() above, if it can grab the vma's mmap sem.
104 */
105static void munlock_vma_page(struct page *page)
106{
107 BUG_ON(!PageLocked(page));
108
109 if (TestClearPageMlocked(page) && !isolate_lru_page(page)) {
110 try_to_munlock(page);
111 putback_lru_page(page);
112 }
113}
114
115/*
116 * mlock a range of pages in the vma.
117 *
118 * This takes care of making the pages present too.
119 *
120 * vma->vm_mm->mmap_sem must be held for write.
121 */
122static int __mlock_vma_pages_range(struct vm_area_struct *vma,
123 unsigned long start, unsigned long end)
124{
125 struct mm_struct *mm = vma->vm_mm;
126 unsigned long addr = start;
127 struct page *pages[16]; /* 16 gives a reasonable batch */
128 int write = !!(vma->vm_flags & VM_WRITE);
129 int nr_pages = (end - start) / PAGE_SIZE;
130 int ret;
131
132 VM_BUG_ON(start & ~PAGE_MASK || end & ~PAGE_MASK);
133 VM_BUG_ON(start < vma->vm_start || end > vma->vm_end);
134 VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
135
136 lru_add_drain_all(); /* push cached pages to LRU */
137
138 while (nr_pages > 0) {
139 int i;
140
141 cond_resched();
142
143 /*
144 * get_user_pages makes pages present if we are
145 * setting mlock. and this extra reference count will
146 * disable migration of this page. However, page may
147 * still be truncated out from under us.
148 */
149 ret = get_user_pages(current, mm, addr,
150 min_t(int, nr_pages, ARRAY_SIZE(pages)),
151 write, 0, pages, NULL);
152 /*
153 * This can happen for, e.g., VM_NONLINEAR regions before
154 * a page has been allocated and mapped at a given offset,
155 * or for addresses that map beyond end of a file.
 156 * We'll mlock the pages if/when they get faulted in.
157 */
158 if (ret < 0)
159 break;
160 if (ret == 0) {
161 /*
162 * We know the vma is there, so the only time
163 * we cannot get a single page should be an
164 * error (ret < 0) case.
165 */
166 WARN_ON(1);
167 break;
168 }
169
170 lru_add_drain(); /* push cached pages to LRU */
171
172 for (i = 0; i < ret; i++) {
173 struct page *page = pages[i];
174
175 lock_page(page);
176 /*
177 * Because we lock page here and migration is blocked
178 * by the elevated reference, we need only check for
179 * page truncation (file-cache only).
180 */
181 if (page->mapping)
182 mlock_vma_page(page);
183 unlock_page(page);
184 put_page(page); /* ref from get_user_pages() */
185
186 /*
187 * here we assume that get_user_pages() has given us
188 * a list of virtually contiguous pages.
189 */
190 addr += PAGE_SIZE; /* for next get_user_pages() */
191 nr_pages--;
192 }
193 }
194
195 lru_add_drain_all(); /* to update stats */
196
197 return 0; /* count entire vma as locked_vm */
198}
199
200/*
201 * private structure for munlock page table walk
202 */
203struct munlock_page_walk {
204 struct vm_area_struct *vma;
205 pmd_t *pmd; /* for migration_entry_wait() */
206};
207
208/*
209 * munlock normal pages for present ptes
210 */
211static int __munlock_pte_handler(pte_t *ptep, unsigned long addr,
212 unsigned long end, struct mm_walk *walk)
213{
214 struct munlock_page_walk *mpw = walk->private;
215 swp_entry_t entry;
216 struct page *page;
217 pte_t pte;
218
219retry:
220 pte = *ptep;
221 /*
222 * If it's a swap pte, we might be racing with page migration.
223 */
224 if (unlikely(!pte_present(pte))) {
225 if (!is_swap_pte(pte))
226 goto out;
227 entry = pte_to_swp_entry(pte);
228 if (is_migration_entry(entry)) {
229 migration_entry_wait(mpw->vma->vm_mm, mpw->pmd, addr);
230 goto retry;
231 }
232 goto out;
233 }
234
235 page = vm_normal_page(mpw->vma, addr, pte);
236 if (!page)
237 goto out;
238
239 lock_page(page);
240 if (!page->mapping) {
241 unlock_page(page);
242 goto retry;
243 }
244 munlock_vma_page(page);
245 unlock_page(page);
246
247out:
248 return 0;
249}
250
251/*
252 * Save pmd for pte handler for waiting on migration entries
253 */
254static int __munlock_pmd_handler(pmd_t *pmd, unsigned long addr,
255 unsigned long end, struct mm_walk *walk)
256{
257 struct munlock_page_walk *mpw = walk->private;
258
259 mpw->pmd = pmd;
260 return 0;
261}
262
263
264/*
265 * munlock a range of pages in the vma using standard page table walk.
266 *
267 * vma->vm_mm->mmap_sem must be held for write.
268 */
269static void __munlock_vma_pages_range(struct vm_area_struct *vma,
270 unsigned long start, unsigned long end)
271{
272 struct mm_struct *mm = vma->vm_mm;
273 struct munlock_page_walk mpw = {
274 .vma = vma,
275 };
276 struct mm_walk munlock_page_walk = {
277 .pmd_entry = __munlock_pmd_handler,
278 .pte_entry = __munlock_pte_handler,
279 .private = &mpw,
280 .mm = mm,
281 };
282
283 VM_BUG_ON(start & ~PAGE_MASK || end & ~PAGE_MASK);
284 VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
285 VM_BUG_ON(start < vma->vm_start);
286 VM_BUG_ON(end > vma->vm_end);
287
288 lru_add_drain_all(); /* push cached pages to LRU */
289 walk_page_range(start, end, &munlock_page_walk);
290 lru_add_drain_all(); /* to update stats */
291}
292
293#else /* CONFIG_UNEVICTABLE_LRU */
294
295/*
296 * Just make pages present if VM_LOCKED. No-op if unlocking.
297 */
298static int __mlock_vma_pages_range(struct vm_area_struct *vma,
299 unsigned long start, unsigned long end)
300{
301 if (vma->vm_flags & VM_LOCKED)
302 make_pages_present(start, end);
303 return 0;
304}
305
306/*
307 * munlock a range of pages in the vma -- no-op.
308 */
309static void __munlock_vma_pages_range(struct vm_area_struct *vma,
310 unsigned long start, unsigned long end)
311{
312}
313#endif /* CONFIG_UNEVICTABLE_LRU */
314
315/*
316 * mlock all pages in this vma range. For mmap()/mremap()/...
317 */
318int mlock_vma_pages_range(struct vm_area_struct *vma,
319 unsigned long start, unsigned long end)
320{
321 int nr_pages = (end - start) / PAGE_SIZE;
322 BUG_ON(!(vma->vm_flags & VM_LOCKED));
323
324 /*
325 * filter unlockable vmas
326 */
327 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
328 goto no_mlock;
329
330 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
331 is_vm_hugetlb_page(vma) ||
332 vma == get_gate_vma(current)))
333 return __mlock_vma_pages_range(vma, start, end);
334
335 /*
336 * User mapped kernel pages or huge pages:
337 * make these pages present to populate the ptes, but
338 * fall thru' to reset VM_LOCKED--no need to unlock, and
339 * return nr_pages so these don't get counted against task's
340 * locked limit. huge pages are already counted against
341 * locked vm limit.
342 */
343 make_pages_present(start, end);
344
345no_mlock:
346 vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */
347 return nr_pages; /* pages NOT mlocked */
348}
349
350
351/*
352 * munlock all pages in vma. For munmap() and exit().
353 */
354void munlock_vma_pages_all(struct vm_area_struct *vma)
355{
356 vma->vm_flags &= ~VM_LOCKED;
357 __munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
358}
359
360/*
361 * mlock_fixup - handle mlock[all]/munlock[all] requests.
362 *
363 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
364 * munlock is a no-op. However, for some special vmas, we go ahead and
365 * populate the ptes via make_pages_present().
366 *
367 * For vmas that pass the filters, merge/split as appropriate.
368 */
26static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, 369static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
27 unsigned long start, unsigned long end, unsigned int newflags) 370 unsigned long start, unsigned long end, unsigned int newflags)
28{ 371{
29 struct mm_struct * mm = vma->vm_mm; 372 struct mm_struct *mm = vma->vm_mm;
30 pgoff_t pgoff; 373 pgoff_t pgoff;
31 int pages; 374 int nr_pages;
32 int ret = 0; 375 int ret = 0;
33 376 int lock = newflags & VM_LOCKED;
34 if (newflags == vma->vm_flags) { 377
35 *prev = vma; 378 if (newflags == vma->vm_flags ||
36 goto out; 379 (vma->vm_flags & (VM_IO | VM_PFNMAP)))
380 goto out; /* don't set VM_LOCKED, don't count */
381
382 if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
383 is_vm_hugetlb_page(vma) ||
384 vma == get_gate_vma(current)) {
385 if (lock)
386 make_pages_present(start, end);
387 goto out; /* don't set VM_LOCKED, don't count */
37 } 388 }
38 389
39 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 390 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -44,8 +395,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
44 goto success; 395 goto success;
45 } 396 }
46 397
47 *prev = vma;
48
49 if (start != vma->vm_start) { 398 if (start != vma->vm_start) {
50 ret = split_vma(mm, vma, start, 1); 399 ret = split_vma(mm, vma, start, 1);
51 if (ret) 400 if (ret)
@@ -60,24 +409,31 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
60 409
61success: 410success:
62 /* 411 /*
412 * Keep track of amount of locked VM.
413 */
414 nr_pages = (end - start) >> PAGE_SHIFT;
415 if (!lock)
416 nr_pages = -nr_pages;
417 mm->locked_vm += nr_pages;
418
419 /*
63 * vm_flags is protected by the mmap_sem held in write mode. 420 * vm_flags is protected by the mmap_sem held in write mode.
64 * It's okay if try_to_unmap_one unmaps a page just after we 421 * It's okay if try_to_unmap_one unmaps a page just after we
65 * set VM_LOCKED, make_pages_present below will bring it back. 422 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
66 */ 423 */
67 vma->vm_flags = newflags; 424 vma->vm_flags = newflags;
68 425
69 /* 426 if (lock) {
70 * Keep track of amount of locked VM. 427 ret = __mlock_vma_pages_range(vma, start, end);
71 */ 428 if (ret > 0) {
72 pages = (end - start) >> PAGE_SHIFT; 429 mm->locked_vm -= ret;
73 if (newflags & VM_LOCKED) { 430 ret = 0;
74 pages = -pages; 431 }
75 if (!(newflags & VM_IO)) 432 } else
76 ret = make_pages_present(start, end); 433 __munlock_vma_pages_range(vma, start, end);
77 }
78 434
79 mm->locked_vm -= pages;
80out: 435out:
436 *prev = vma;
81 return ret; 437 return ret;
82} 438}
83 439
diff --git a/mm/mmap.c b/mm/mmap.c
index e7a5a68a9c2e..7bdfd2661f17 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -662,8 +662,6 @@ again: remove_next = 1 + (end > next->vm_end);
662 * If the vma has a ->close operation then the driver probably needs to release 662 * If the vma has a ->close operation then the driver probably needs to release
663 * per-vma resources, so we don't attempt to merge those. 663 * per-vma resources, so we don't attempt to merge those.
664 */ 664 */
665#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
666
667static inline int is_mergeable_vma(struct vm_area_struct *vma, 665static inline int is_mergeable_vma(struct vm_area_struct *vma,
668 struct file *file, unsigned long vm_flags) 666 struct file *file, unsigned long vm_flags)
669{ 667{
diff --git a/mm/nommu.c b/mm/nommu.c
index ed75bc962fbe..2696b24f2bb3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -34,6 +34,8 @@
34#include <asm/tlb.h> 34#include <asm/tlb.h>
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36 36
37#include "internal.h"
38
37void *high_memory; 39void *high_memory;
38struct page *mem_map; 40struct page *mem_map;
39unsigned long max_mapnr; 41unsigned long max_mapnr;
@@ -128,20 +130,16 @@ unsigned int kobjsize(const void *objp)
128 return PAGE_SIZE << compound_order(page); 130 return PAGE_SIZE << compound_order(page);
129} 131}
130 132
131/* 133int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
132 * get a list of pages in an address range belonging to the specified process 134 unsigned long start, int len, int flags,
133 * and indicate the VMA that covers each page 135 struct page **pages, struct vm_area_struct **vmas)
134 * - this is potentially dodgy as we may end incrementing the page count of a
135 * slab page or a secondary page from a compound page
136 * - don't permit access to VMAs that don't support it, such as I/O mappings
137 */
138int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
139 unsigned long start, int len, int write, int force,
140 struct page **pages, struct vm_area_struct **vmas)
141{ 136{
142 struct vm_area_struct *vma; 137 struct vm_area_struct *vma;
143 unsigned long vm_flags; 138 unsigned long vm_flags;
144 int i; 139 int i;
140 int write = !!(flags & GUP_FLAGS_WRITE);
141 int force = !!(flags & GUP_FLAGS_FORCE);
142 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
145 143
146 /* calculate required read or write permissions. 144 /* calculate required read or write permissions.
147 * - if 'force' is set, we only require the "MAY" flags. 145 * - if 'force' is set, we only require the "MAY" flags.
@@ -156,7 +154,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
156 154
157 /* protect what we can, including chardevs */ 155 /* protect what we can, including chardevs */
158 if (vma->vm_flags & (VM_IO | VM_PFNMAP) || 156 if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
159 !(vm_flags & vma->vm_flags)) 157 (!ignore && !(vm_flags & vma->vm_flags)))
160 goto finish_or_fault; 158 goto finish_or_fault;
161 159
162 if (pages) { 160 if (pages) {
@@ -174,6 +172,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
174finish_or_fault: 172finish_or_fault:
175 return i ? : -EFAULT; 173 return i ? : -EFAULT;
176} 174}
175
176
177/*
178 * get a list of pages in an address range belonging to the specified process
179 * and indicate the VMA that covers each page
180 * - this is potentially dodgy as we may end incrementing the page count of a
181 * slab page or a secondary page from a compound page
182 * - don't permit access to VMAs that don't support it, such as I/O mappings
183 */
184int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
185 unsigned long start, int len, int write, int force,
186 struct page **pages, struct vm_area_struct **vmas)
187{
188 int flags = 0;
189
190 if (write)
191 flags |= GUP_FLAGS_WRITE;
192 if (force)
193 flags |= GUP_FLAGS_FORCE;
194
195 return __get_user_pages(tsk, mm,
196 start, len, flags,
197 pages, vmas);
198}
177EXPORT_SYMBOL(get_user_pages); 199EXPORT_SYMBOL(get_user_pages);
178 200
179DEFINE_RWLOCK(vmlist_lock); 201DEFINE_RWLOCK(vmlist_lock);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4125230a1b2c..5886586fde6c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -616,7 +616,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
616 616
617 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | 617 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
618 1 << PG_referenced | 1 << PG_arch_1 | 618 1 << PG_referenced | 1 << PG_arch_1 |
619 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); 619 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
620#ifdef CONFIG_UNEVICTABLE_LRU
621 | 1 << PG_mlocked
622#endif
623 );
620 set_page_private(page, 0); 624 set_page_private(page, 0);
621 set_page_refcounted(page); 625 set_page_refcounted(page);
622 626
diff --git a/mm/rmap.c b/mm/rmap.c
index e8d639b16c6d..7e60df99018e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -53,6 +53,8 @@
53 53
54#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
55 55
56#include "internal.h"
57
56struct kmem_cache *anon_vma_cachep; 58struct kmem_cache *anon_vma_cachep;
57 59
58/** 60/**
@@ -290,6 +292,32 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
290 return NULL; 292 return NULL;
291} 293}
292 294
295/**
296 * page_mapped_in_vma - check whether a page is really mapped in a VMA
297 * @page: the page to test
298 * @vma: the VMA to test
299 *
300 * Returns 1 if the page is mapped into the page tables of the VMA, 0
301 * if the page is not mapped into the page tables of this VMA. Only
302 * valid for normal file or anonymous VMAs.
303 */
304static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
305{
306 unsigned long address;
307 pte_t *pte;
308 spinlock_t *ptl;
309
310 address = vma_address(page, vma);
311 if (address == -EFAULT) /* out of vma range */
312 return 0;
313 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
314 if (!pte) /* the page is not in this mm */
315 return 0;
316 pte_unmap_unlock(pte, ptl);
317
318 return 1;
319}
320
293/* 321/*
294 * Subfunctions of page_referenced: page_referenced_one called 322 * Subfunctions of page_referenced: page_referenced_one called
295 * repeatedly from either page_referenced_anon or page_referenced_file. 323 * repeatedly from either page_referenced_anon or page_referenced_file.
@@ -311,10 +339,17 @@ static int page_referenced_one(struct page *page,
311 if (!pte) 339 if (!pte)
312 goto out; 340 goto out;
313 341
342 /*
343 * Don't want to elevate referenced for mlocked page that gets this far,
344 * in order that it progresses to try_to_unmap and is moved to the
345 * unevictable list.
346 */
314 if (vma->vm_flags & VM_LOCKED) { 347 if (vma->vm_flags & VM_LOCKED) {
315 referenced++;
316 *mapcount = 1; /* break early from loop */ 348 *mapcount = 1; /* break early from loop */
317 } else if (ptep_clear_flush_young_notify(vma, address, pte)) 349 goto out_unmap;
350 }
351
352 if (ptep_clear_flush_young_notify(vma, address, pte))
318 referenced++; 353 referenced++;
319 354
320 /* Pretend the page is referenced if the task has the 355 /* Pretend the page is referenced if the task has the
@@ -323,6 +358,7 @@ static int page_referenced_one(struct page *page,
323 rwsem_is_locked(&mm->mmap_sem)) 358 rwsem_is_locked(&mm->mmap_sem))
324 referenced++; 359 referenced++;
325 360
361out_unmap:
326 (*mapcount)--; 362 (*mapcount)--;
327 pte_unmap_unlock(pte, ptl); 363 pte_unmap_unlock(pte, ptl);
328out: 364out:
@@ -412,11 +448,6 @@ static int page_referenced_file(struct page *page,
412 */ 448 */
413 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 449 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
414 continue; 450 continue;
415 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
416 == (VM_LOCKED|VM_MAYSHARE)) {
417 referenced++;
418 break;
419 }
420 referenced += page_referenced_one(page, vma, &mapcount); 451 referenced += page_referenced_one(page, vma, &mapcount);
421 if (!mapcount) 452 if (!mapcount)
422 break; 453 break;
@@ -739,11 +770,16 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
739 * If it's recently referenced (perhaps page_referenced 770 * If it's recently referenced (perhaps page_referenced
740 * skipped over this mm) then we should reactivate it. 771 * skipped over this mm) then we should reactivate it.
741 */ 772 */
742 if (!migration && ((vma->vm_flags & VM_LOCKED) || 773 if (!migration) {
743 (ptep_clear_flush_young_notify(vma, address, pte)))) { 774 if (vma->vm_flags & VM_LOCKED) {
744 ret = SWAP_FAIL; 775 ret = SWAP_MLOCK;
745 goto out_unmap; 776 goto out_unmap;
746 } 777 }
778 if (ptep_clear_flush_young_notify(vma, address, pte)) {
779 ret = SWAP_FAIL;
780 goto out_unmap;
781 }
782 }
747 783
748 /* Nuke the page table entry. */ 784 /* Nuke the page table entry. */
749 flush_cache_page(vma, address, page_to_pfn(page)); 785 flush_cache_page(vma, address, page_to_pfn(page));
@@ -824,12 +860,17 @@ out:
824 * For very sparsely populated VMAs this is a little inefficient - chances are 860 * For very sparsely populated VMAs this is a little inefficient - chances are
825 * there there won't be many ptes located within the scan cluster. In this case 861 * there there won't be many ptes located within the scan cluster. In this case
826 * maybe we could scan further - to the end of the pte page, perhaps. 862 * maybe we could scan further - to the end of the pte page, perhaps.
863 *
864 * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
865 * acquire it without blocking. If vma locked, mlock the pages in the cluster,
866 * rather than unmapping them. If we encounter the "check_page" that vmscan is
867 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
827 */ 868 */
828#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) 869#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
829#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) 870#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
830 871
831static void try_to_unmap_cluster(unsigned long cursor, 872static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
832 unsigned int *mapcount, struct vm_area_struct *vma) 873 struct vm_area_struct *vma, struct page *check_page)
833{ 874{
834 struct mm_struct *mm = vma->vm_mm; 875 struct mm_struct *mm = vma->vm_mm;
835 pgd_t *pgd; 876 pgd_t *pgd;
@@ -841,6 +882,8 @@ static void try_to_unmap_cluster(unsigned long cursor,
841 struct page *page; 882 struct page *page;
842 unsigned long address; 883 unsigned long address;
843 unsigned long end; 884 unsigned long end;
885 int ret = SWAP_AGAIN;
886 int locked_vma = 0;
844 887
845 address = (vma->vm_start + cursor) & CLUSTER_MASK; 888 address = (vma->vm_start + cursor) & CLUSTER_MASK;
846 end = address + CLUSTER_SIZE; 889 end = address + CLUSTER_SIZE;
@@ -851,15 +894,26 @@ static void try_to_unmap_cluster(unsigned long cursor,
851 894
852 pgd = pgd_offset(mm, address); 895 pgd = pgd_offset(mm, address);
853 if (!pgd_present(*pgd)) 896 if (!pgd_present(*pgd))
854 return; 897 return ret;
855 898
856 pud = pud_offset(pgd, address); 899 pud = pud_offset(pgd, address);
857 if (!pud_present(*pud)) 900 if (!pud_present(*pud))
858 return; 901 return ret;
859 902
860 pmd = pmd_offset(pud, address); 903 pmd = pmd_offset(pud, address);
861 if (!pmd_present(*pmd)) 904 if (!pmd_present(*pmd))
862 return; 905 return ret;
906
907 /*
908 * MLOCK_PAGES => feature is configured.
909 * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
910 * keep the sem while scanning the cluster for mlocking pages.
911 */
912 if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) {
913 locked_vma = (vma->vm_flags & VM_LOCKED);
914 if (!locked_vma)
915 up_read(&vma->vm_mm->mmap_sem); /* don't need it */
916 }
863 917
864 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 918 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
865 919
@@ -872,6 +926,13 @@ static void try_to_unmap_cluster(unsigned long cursor,
872 page = vm_normal_page(vma, address, *pte); 926 page = vm_normal_page(vma, address, *pte);
873 BUG_ON(!page || PageAnon(page)); 927 BUG_ON(!page || PageAnon(page));
874 928
929 if (locked_vma) {
930 mlock_vma_page(page); /* no-op if already mlocked */
931 if (page == check_page)
932 ret = SWAP_MLOCK;
933 continue; /* don't unmap */
934 }
935
875 if (ptep_clear_flush_young_notify(vma, address, pte)) 936 if (ptep_clear_flush_young_notify(vma, address, pte))
876 continue; 937 continue;
877 938
@@ -893,39 +954,104 @@ static void try_to_unmap_cluster(unsigned long cursor,
893 (*mapcount)--; 954 (*mapcount)--;
894 } 955 }
895 pte_unmap_unlock(pte - 1, ptl); 956 pte_unmap_unlock(pte - 1, ptl);
957 if (locked_vma)
958 up_read(&vma->vm_mm->mmap_sem);
959 return ret;
896} 960}
897 961
898static int try_to_unmap_anon(struct page *page, int migration) 962/*
963 * common handling for pages mapped in VM_LOCKED vmas
964 */
965static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
966{
967 int mlocked = 0;
968
969 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
970 if (vma->vm_flags & VM_LOCKED) {
971 mlock_vma_page(page);
972 mlocked++; /* really mlocked the page */
973 }
974 up_read(&vma->vm_mm->mmap_sem);
975 }
976 return mlocked;
977}
978
979/**
980 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
981 * rmap method
982 * @page: the page to unmap/unlock
983 * @unlock: request for unlock rather than unmap [unlikely]
984 * @migration: unmapping for migration - ignored if @unlock
985 *
986 * Find all the mappings of a page using the mapping pointer and the vma chains
987 * contained in the anon_vma struct it points to.
988 *
989 * This function is only called from try_to_unmap/try_to_munlock for
990 * anonymous pages.
991 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
992 * where the page was found will be held for write. So, we won't recheck
993 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
994 * 'LOCKED.
995 */
996static int try_to_unmap_anon(struct page *page, int unlock, int migration)
899{ 997{
900 struct anon_vma *anon_vma; 998 struct anon_vma *anon_vma;
901 struct vm_area_struct *vma; 999 struct vm_area_struct *vma;
1000 unsigned int mlocked = 0;
902 int ret = SWAP_AGAIN; 1001 int ret = SWAP_AGAIN;
903 1002
1003 if (MLOCK_PAGES && unlikely(unlock))
1004 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
1005
904 anon_vma = page_lock_anon_vma(page); 1006 anon_vma = page_lock_anon_vma(page);
905 if (!anon_vma) 1007 if (!anon_vma)
906 return ret; 1008 return ret;
907 1009
908 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1010 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
909 ret = try_to_unmap_one(page, vma, migration); 1011 if (MLOCK_PAGES && unlikely(unlock)) {
910 if (ret == SWAP_FAIL || !page_mapped(page)) 1012 if (!((vma->vm_flags & VM_LOCKED) &&
911 break; 1013 page_mapped_in_vma(page, vma)))
1014 continue; /* must visit all unlocked vmas */
1015 ret = SWAP_MLOCK; /* saw at least one mlocked vma */
1016 } else {
1017 ret = try_to_unmap_one(page, vma, migration);
1018 if (ret == SWAP_FAIL || !page_mapped(page))
1019 break;
1020 }
1021 if (ret == SWAP_MLOCK) {
1022 mlocked = try_to_mlock_page(page, vma);
1023 if (mlocked)
1024 break; /* stop if actually mlocked page */
1025 }
912 } 1026 }
913 1027
914 page_unlock_anon_vma(anon_vma); 1028 page_unlock_anon_vma(anon_vma);
1029
1030 if (mlocked)
1031 ret = SWAP_MLOCK; /* actually mlocked the page */
1032 else if (ret == SWAP_MLOCK)
1033 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
1034
915 return ret; 1035 return ret;
916} 1036}
917 1037
918/** 1038/**
919 * try_to_unmap_file - unmap file page using the object-based rmap method 1039 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
920 * @page: the page to unmap 1040 * @page: the page to unmap/unlock
921 * @migration: migration flag 1041 * @unlock: request for unlock rather than unmap [unlikely]
1042 * @migration: unmapping for migration - ignored if @unlock
922 * 1043 *
923 * Find all the mappings of a page using the mapping pointer and the vma chains 1044 * Find all the mappings of a page using the mapping pointer and the vma chains
924 * contained in the address_space struct it points to. 1045 * contained in the address_space struct it points to.
925 * 1046 *
926 * This function is only called from try_to_unmap for object-based pages. 1047 * This function is only called from try_to_unmap/try_to_munlock for
1048 * object-based pages.
1049 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1050 * where the page was found will be held for write. So, we won't recheck
1051 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1052 * 'LOCKED.
927 */ 1053 */
928static int try_to_unmap_file(struct page *page, int migration) 1054static int try_to_unmap_file(struct page *page, int unlock, int migration)
929{ 1055{
930 struct address_space *mapping = page->mapping; 1056 struct address_space *mapping = page->mapping;
931 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1057 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -936,20 +1062,44 @@ static int try_to_unmap_file(struct page *page, int migration)
936 unsigned long max_nl_cursor = 0; 1062 unsigned long max_nl_cursor = 0;
937 unsigned long max_nl_size = 0; 1063 unsigned long max_nl_size = 0;
938 unsigned int mapcount; 1064 unsigned int mapcount;
1065 unsigned int mlocked = 0;
1066
1067 if (MLOCK_PAGES && unlikely(unlock))
1068 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
939 1069
940 spin_lock(&mapping->i_mmap_lock); 1070 spin_lock(&mapping->i_mmap_lock);
941 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1071 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
942 ret = try_to_unmap_one(page, vma, migration); 1072 if (MLOCK_PAGES && unlikely(unlock)) {
943 if (ret == SWAP_FAIL || !page_mapped(page)) 1073 if (!(vma->vm_flags & VM_LOCKED))
944 goto out; 1074 continue; /* must visit all vmas */
1075 ret = SWAP_MLOCK;
1076 } else {
1077 ret = try_to_unmap_one(page, vma, migration);
1078 if (ret == SWAP_FAIL || !page_mapped(page))
1079 goto out;
1080 }
1081 if (ret == SWAP_MLOCK) {
1082 mlocked = try_to_mlock_page(page, vma);
1083 if (mlocked)
1084 break; /* stop if actually mlocked page */
1085 }
945 } 1086 }
946 1087
1088 if (mlocked)
1089 goto out;
1090
947 if (list_empty(&mapping->i_mmap_nonlinear)) 1091 if (list_empty(&mapping->i_mmap_nonlinear))
948 goto out; 1092 goto out;
949 1093
950 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1094 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
951 shared.vm_set.list) { 1095 shared.vm_set.list) {
952 if ((vma->vm_flags & VM_LOCKED) && !migration) 1096 if (MLOCK_PAGES && unlikely(unlock)) {
1097 if (!(vma->vm_flags & VM_LOCKED))
1098 continue; /* must visit all vmas */
1099 ret = SWAP_MLOCK; /* leave mlocked == 0 */
1100 goto out; /* no need to look further */
1101 }
1102 if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
953 continue; 1103 continue;
954 cursor = (unsigned long) vma->vm_private_data; 1104 cursor = (unsigned long) vma->vm_private_data;
955 if (cursor > max_nl_cursor) 1105 if (cursor > max_nl_cursor)
@@ -959,7 +1109,7 @@ static int try_to_unmap_file(struct page *page, int migration)
959 max_nl_size = cursor; 1109 max_nl_size = cursor;
960 } 1110 }
961 1111
962 if (max_nl_size == 0) { /* any nonlinears locked or reserved */ 1112 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
963 ret = SWAP_FAIL; 1113 ret = SWAP_FAIL;
964 goto out; 1114 goto out;
965 } 1115 }
@@ -983,12 +1133,16 @@ static int try_to_unmap_file(struct page *page, int migration)
983 do { 1133 do {
984 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1134 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
985 shared.vm_set.list) { 1135 shared.vm_set.list) {
986 if ((vma->vm_flags & VM_LOCKED) && !migration) 1136 if (!MLOCK_PAGES && !migration &&
1137 (vma->vm_flags & VM_LOCKED))
987 continue; 1138 continue;
988 cursor = (unsigned long) vma->vm_private_data; 1139 cursor = (unsigned long) vma->vm_private_data;
989 while ( cursor < max_nl_cursor && 1140 while ( cursor < max_nl_cursor &&
990 cursor < vma->vm_end - vma->vm_start) { 1141 cursor < vma->vm_end - vma->vm_start) {
991 try_to_unmap_cluster(cursor, &mapcount, vma); 1142 ret = try_to_unmap_cluster(cursor, &mapcount,
1143 vma, page);
1144 if (ret == SWAP_MLOCK)
1145 mlocked = 2; /* to return below */
992 cursor += CLUSTER_SIZE; 1146 cursor += CLUSTER_SIZE;
993 vma->vm_private_data = (void *) cursor; 1147 vma->vm_private_data = (void *) cursor;
994 if ((int)mapcount <= 0) 1148 if ((int)mapcount <= 0)
@@ -1009,6 +1163,10 @@ static int try_to_unmap_file(struct page *page, int migration)
1009 vma->vm_private_data = NULL; 1163 vma->vm_private_data = NULL;
1010out: 1164out:
1011 spin_unlock(&mapping->i_mmap_lock); 1165 spin_unlock(&mapping->i_mmap_lock);
1166 if (mlocked)
1167 ret = SWAP_MLOCK; /* actually mlocked the page */
1168 else if (ret == SWAP_MLOCK)
1169 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
1012 return ret; 1170 return ret;
1013} 1171}
1014 1172
@@ -1024,6 +1182,7 @@ out:
1024 * SWAP_SUCCESS - we succeeded in removing all mappings 1182 * SWAP_SUCCESS - we succeeded in removing all mappings
1025 * SWAP_AGAIN - we missed a mapping, try again later 1183 * SWAP_AGAIN - we missed a mapping, try again later
1026 * SWAP_FAIL - the page is unswappable 1184 * SWAP_FAIL - the page is unswappable
1185 * SWAP_MLOCK - page is mlocked.
1027 */ 1186 */
1028int try_to_unmap(struct page *page, int migration) 1187int try_to_unmap(struct page *page, int migration)
1029{ 1188{
@@ -1032,12 +1191,36 @@ int try_to_unmap(struct page *page, int migration)
1032 BUG_ON(!PageLocked(page)); 1191 BUG_ON(!PageLocked(page));
1033 1192
1034 if (PageAnon(page)) 1193 if (PageAnon(page))
1035 ret = try_to_unmap_anon(page, migration); 1194 ret = try_to_unmap_anon(page, 0, migration);
1036 else 1195 else
1037 ret = try_to_unmap_file(page, migration); 1196 ret = try_to_unmap_file(page, 0, migration);
1038 1197 if (ret != SWAP_MLOCK && !page_mapped(page))
1039 if (!page_mapped(page))
1040 ret = SWAP_SUCCESS; 1198 ret = SWAP_SUCCESS;
1041 return ret; 1199 return ret;
1042} 1200}
1043 1201
1202#ifdef CONFIG_UNEVICTABLE_LRU
1203/**
1204 * try_to_munlock - try to munlock a page
1205 * @page: the page to be munlocked
1206 *
1207 * Called from munlock code. Checks all of the VMAs mapping the page
1208 * to make sure nobody else has this page mlocked. The page will be
1209 * returned with PG_mlocked cleared if no other vmas have it mlocked.
1210 *
1211 * Return values are:
1212 *
1213 * SWAP_SUCCESS - no vma's holding page mlocked.
1214 * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
1215 * SWAP_MLOCK - page is now mlocked.
1216 */
1217int try_to_munlock(struct page *page)
1218{
1219 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1220
1221 if (PageAnon(page))
1222 return try_to_unmap_anon(page, 1, 0);
1223 else
1224 return try_to_unmap_file(page, 1, 0);
1225}
1226#endif
diff --git a/mm/swap.c b/mm/swap.c
index fee6b973f143..bc58c1369dd6 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -278,7 +278,7 @@ void lru_add_drain(void)
278 put_cpu(); 278 put_cpu();
279} 279}
280 280
281#ifdef CONFIG_NUMA 281#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU)
282static void lru_add_drain_per_cpu(struct work_struct *dummy) 282static void lru_add_drain_per_cpu(struct work_struct *dummy)
283{ 283{
284 lru_add_drain(); 284 lru_add_drain();
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dfb342e0db9b..e5aaaad159ef 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -582,11 +582,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
582 582
583 sc->nr_scanned++; 583 sc->nr_scanned++;
584 584
585 if (unlikely(!page_evictable(page, NULL))) { 585 if (unlikely(!page_evictable(page, NULL)))
586 unlock_page(page); 586 goto cull_mlocked;
587 putback_lru_page(page);
588 continue;
589 }
590 587
591 if (!sc->may_swap && page_mapped(page)) 588 if (!sc->may_swap && page_mapped(page))
592 goto keep_locked; 589 goto keep_locked;
@@ -624,9 +621,19 @@ static unsigned long shrink_page_list(struct list_head *page_list,
624 * Anonymous process memory has backing store? 621 * Anonymous process memory has backing store?
625 * Try to allocate it some swap space here. 622 * Try to allocate it some swap space here.
626 */ 623 */
627 if (PageAnon(page) && !PageSwapCache(page)) 624 if (PageAnon(page) && !PageSwapCache(page)) {
625 switch (try_to_munlock(page)) {
626 case SWAP_FAIL: /* shouldn't happen */
627 case SWAP_AGAIN:
628 goto keep_locked;
629 case SWAP_MLOCK:
630 goto cull_mlocked;
631 case SWAP_SUCCESS:
632 ; /* fall thru'; add to swap cache */
633 }
628 if (!add_to_swap(page, GFP_ATOMIC)) 634 if (!add_to_swap(page, GFP_ATOMIC))
629 goto activate_locked; 635 goto activate_locked;
636 }
630#endif /* CONFIG_SWAP */ 637#endif /* CONFIG_SWAP */
631 638
632 mapping = page_mapping(page); 639 mapping = page_mapping(page);
@@ -641,6 +648,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
641 goto activate_locked; 648 goto activate_locked;
642 case SWAP_AGAIN: 649 case SWAP_AGAIN:
643 goto keep_locked; 650 goto keep_locked;
651 case SWAP_MLOCK:
652 goto cull_mlocked;
644 case SWAP_SUCCESS: 653 case SWAP_SUCCESS:
645 ; /* try to free the page below */ 654 ; /* try to free the page below */
646 } 655 }
@@ -731,6 +740,11 @@ free_it:
731 } 740 }
732 continue; 741 continue;
733 742
743cull_mlocked:
744 unlock_page(page);
745 putback_lru_page(page);
746 continue;
747
734activate_locked: 748activate_locked:
735 /* Not a candidate for swapping, so reclaim swap space. */ 749 /* Not a candidate for swapping, so reclaim swap space. */
736 if (PageSwapCache(page) && vm_swap_full()) 750 if (PageSwapCache(page) && vm_swap_full())
@@ -742,7 +756,7 @@ keep_locked:
742 unlock_page(page); 756 unlock_page(page);
743keep: 757keep:
744 list_add(&page->lru, &ret_pages); 758 list_add(&page->lru, &ret_pages);
745 VM_BUG_ON(PageLRU(page)); 759 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
746 } 760 }
747 list_splice(&ret_pages, page_list); 761 list_splice(&ret_pages, page_list);
748 if (pagevec_count(&freed_pvec)) 762 if (pagevec_count(&freed_pvec))
@@ -2329,12 +2343,13 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2329 * @vma: the VMA in which the page is or will be mapped, may be NULL 2343 * @vma: the VMA in which the page is or will be mapped, may be NULL
2330 * 2344 *
2331 * Test whether page is evictable--i.e., should be placed on active/inactive 2345 * Test whether page is evictable--i.e., should be placed on active/inactive
2332 * lists vs unevictable list. 2346 * lists vs unevictable list. The vma argument is !NULL when called from the
2347 * fault path to determine how to instantiate a new page.
2333 * 2348 *
2334 * Reasons page might not be evictable: 2349 * Reasons page might not be evictable:
2335 * (1) page's mapping marked unevictable 2350 * (1) page's mapping marked unevictable
2351 * (2) page is part of an mlocked VMA
2336 * 2352 *
2337 * TODO - later patches
2338 */ 2353 */
2339int page_evictable(struct page *page, struct vm_area_struct *vma) 2354int page_evictable(struct page *page, struct vm_area_struct *vma)
2340{ 2355{
@@ -2342,7 +2357,8 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
2342 if (mapping_unevictable(page_mapping(page))) 2357 if (mapping_unevictable(page_mapping(page)))
2343 return 0; 2358 return 0;
2344 2359
2345 /* TODO: test page [!]evictable conditions */ 2360 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
2361 return 0;
2346 2362
2347 return 1; 2363 return 1;
2348} 2364}
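
Editor's exercise (not part of the patch): the userspace program below drives
the paths this patch touches -- mlock(2) goes through mlock_fixup() and
__mlock_vma_pages_range(), and munlock(2) through the munlock walk and
try_to_munlock().  The /proc/meminfo field names mentioned in the comments
assume the rest of the unevictable-LRU series is applied.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* 64MB; may need CAP_IPC_LOCK or a raised RLIMIT_MEMLOCK */
	size_t len = 64UL << 20;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	if (mlock(buf, len)) {		/* mlock_fixup() -> __mlock_vma_pages_range() */
		perror("mlock");
		return 1;
	}
	memset(buf, 0xa5, len);		/* touch every page while VM_LOCKED */
	puts("mlocked; check Unevictable:/Mlocked: in /proc/meminfo");
	sleep(30);
	if (munlock(buf, len))		/* munlock path -> try_to_munlock() */
		perror("munlock");
	munmap(buf, len);
	return 0;
}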