 include/linux/mm.h         |   5
 include/linux/page-flags.h |  19
 include/linux/rmap.h       |  14
 mm/internal.h              |  71
 mm/memory.c                |  56
 mm/migrate.c               |   2
 mm/mlock.c                 | 394
 mm/mmap.c                  |   2
 mm/nommu.c                 |  44
 mm/page_alloc.c            |   6
 mm/rmap.c                  | 257
 mm/swap.c                  |   2
 mm/vmscan.c                |  36
 13 files changed, 817 insertions(+), 91 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 40236290e2ae..ffee2f743418 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -132,6 +132,11 @@ extern unsigned int kobjsize(const void *objp);
132#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) 132#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ)
133 133
134/* 134/*
135 * special vmas that are non-mergable, non-mlock()able
136 */
137#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
138
139/*
135 * mapping from the currently active vm_flags protection bits (the 140 * mapping from the currently active vm_flags protection bits (the
136 * low four bits) to a page protection mask.. 141 * low four bits) to a page protection mask..
137 */ 142 */
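
As context for the hunk above: VM_SPECIAL moves here from mm/mmap.c so that both the vma-merging code and the new mlock helpers in mm/internal.h can share it. A minimal sketch of the test the mask enables, not part of the patch and with a made-up helper name:

	/* True only for a plain VM_LOCKED vma, i.e. not an I/O, pfn-mapped,
	 * reserved or non-expandable ("special") mapping. */
	static inline int example_vma_is_mlockable(struct vm_area_struct *vma)
	{
		return (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
	}
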
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ec1a1baad348..b12f93a3c345 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -96,6 +96,7 @@ enum pageflags {
96 PG_swapbacked, /* Page is backed by RAM/swap */ 96 PG_swapbacked, /* Page is backed by RAM/swap */
97#ifdef CONFIG_UNEVICTABLE_LRU 97#ifdef CONFIG_UNEVICTABLE_LRU
98 PG_unevictable, /* Page is "unevictable" */ 98 PG_unevictable, /* Page is "unevictable" */
99 PG_mlocked, /* Page is vma mlocked */
99#endif 100#endif
100#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR 101#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
101 PG_uncached, /* Page has been mapped as uncached */ 102 PG_uncached, /* Page has been mapped as uncached */
@@ -232,7 +233,17 @@ PAGEFLAG_FALSE(SwapCache)
232#ifdef CONFIG_UNEVICTABLE_LRU 233#ifdef CONFIG_UNEVICTABLE_LRU
233PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) 234PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
234 TESTCLEARFLAG(Unevictable, unevictable) 235 TESTCLEARFLAG(Unevictable, unevictable)
236
237#define MLOCK_PAGES 1
238PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
239 TESTSCFLAG(Mlocked, mlocked)
240
235#else 241#else
242
243#define MLOCK_PAGES 0
244PAGEFLAG_FALSE(Mlocked)
245 SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked)
246
236PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable) 247PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable)
237 SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable) 248 SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable)
238 __CLEARPAGEFLAG_NOOP(Unevictable) 249 __CLEARPAGEFLAG_NOOP(Unevictable)
@@ -354,15 +365,17 @@ static inline void __ClearPageTail(struct page *page)
354#endif /* !PAGEFLAGS_EXTENDED */ 365#endif /* !PAGEFLAGS_EXTENDED */
355 366
356#ifdef CONFIG_UNEVICTABLE_LRU 367#ifdef CONFIG_UNEVICTABLE_LRU
357#define __PG_UNEVICTABLE (1 << PG_unevictable) 368#define __PG_UNEVICTABLE (1 << PG_unevictable)
369#define __PG_MLOCKED (1 << PG_mlocked)
358#else 370#else
359#define __PG_UNEVICTABLE 0 371#define __PG_UNEVICTABLE 0
372#define __PG_MLOCKED 0
360#endif 373#endif
361 374
362#define PAGE_FLAGS (1 << PG_lru | 1 << PG_private | 1 << PG_locked | \ 375#define PAGE_FLAGS (1 << PG_lru | 1 << PG_private | 1 << PG_locked | \
363 1 << PG_buddy | 1 << PG_writeback | \ 376 1 << PG_buddy | 1 << PG_writeback | \
364 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ 377 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \
365 __PG_UNEVICTABLE) 378 __PG_UNEVICTABLE | __PG_MLOCKED)
366 379
367/* 380/*
368 * Flags checked in bad_page(). Pages on the free list should not have 381 * Flags checked in bad_page(). Pages on the free list should not have
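
The PAGEFLAG()/__CLEARPAGEFLAG()/TESTSCFLAG() lines above generate the usual family of accessors for the new PG_mlocked bit, so the rest of the series never open-codes bit operations. A sketch of the calls available with CONFIG_UNEVICTABLE_LRU enabled, not part of the patch:

	static inline void example_pg_mlocked_accessors(struct page *page)
	{
		SetPageMlocked(page);			/* set PG_mlocked */
		if (PageMlocked(page))			/* test it */
			ClearPageMlocked(page);		/* clear it */
		if (TestSetPageMlocked(page))
			;	/* atomic test-and-set: bit was already set */
		if (TestClearPageMlocked(page))
			;	/* atomic test-and-clear: bit was set, now cleared */
	}
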
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index fed6f5e0b411..955667e6a52d 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -117,6 +117,19 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
117 */ 117 */
118int page_mkclean(struct page *); 118int page_mkclean(struct page *);
119 119
120#ifdef CONFIG_UNEVICTABLE_LRU
121/*
122 * called in munlock()/munmap() path to check for other vmas holding
123 * the page mlocked.
124 */
125int try_to_munlock(struct page *);
126#else
127static inline int try_to_munlock(struct page *page)
128{
129 return 0; /* a.k.a. SWAP_SUCCESS */
130}
131#endif
132
120#else /* !CONFIG_MMU */ 133#else /* !CONFIG_MMU */
121 134
122#define anon_vma_init() do {} while (0) 135#define anon_vma_init() do {} while (0)
@@ -140,5 +153,6 @@ static inline int page_mkclean(struct page *page)
140#define SWAP_SUCCESS 0 153#define SWAP_SUCCESS 0
141#define SWAP_AGAIN 1 154#define SWAP_AGAIN 1
142#define SWAP_FAIL 2 155#define SWAP_FAIL 2
156#define SWAP_MLOCK 3
143 157
144#endif /* _LINUX_RMAP_H */ 158#endif /* _LINUX_RMAP_H */
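
The new SWAP_MLOCK code and try_to_munlock() are consumed by the vmscan and mlock changes later in this diff. A minimal sketch of how a caller holding the page lock is expected to interpret the return values; the function name is hypothetical and this is not part of the patch:

	static int example_page_still_mlocked(struct page *page)
	{
		/* try_to_munlock() requires the page locked and off the LRU */
		switch (try_to_munlock(page)) {
		case SWAP_MLOCK:	/* another VM_LOCKED vma still maps it */
			return 1;	/* keep it on the unevictable list */
		case SWAP_AGAIN:	/* a vma's mmap_sem couldn't be taken */
			return 1;	/* be conservative, retry later */
		case SWAP_FAIL:		/* shouldn't happen */
		case SWAP_SUCCESS:	/* no mlocked vmas map the page */
		default:
			return 0;	/* page is evictable again */
		}
	}
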
diff --git a/mm/internal.h b/mm/internal.h
index 3db17b2a1ac6..4ebf0bef9a39 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -61,6 +61,10 @@ static inline unsigned long page_order(struct page *page)
61 return page_private(page); 61 return page_private(page);
62} 62}
63 63
64extern int mlock_vma_pages_range(struct vm_area_struct *vma,
65 unsigned long start, unsigned long end);
66extern void munlock_vma_pages_all(struct vm_area_struct *vma);
67
64#ifdef CONFIG_UNEVICTABLE_LRU 68#ifdef CONFIG_UNEVICTABLE_LRU
65/* 69/*
66 * unevictable_migrate_page() called only from migrate_page_copy() to 70 * unevictable_migrate_page() called only from migrate_page_copy() to
@@ -79,6 +83,65 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
79} 83}
80#endif 84#endif
81 85
86#ifdef CONFIG_UNEVICTABLE_LRU
87/*
88 * Called only in fault path via page_evictable() for a new page
89 * to determine if it's being mapped into a LOCKED vma.
90 * If so, mark page as mlocked.
91 */
92static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
93{
94 VM_BUG_ON(PageLRU(page));
95
96 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
97 return 0;
98
99 SetPageMlocked(page);
100 return 1;
101}
102
103/*
104 * must be called with vma's mmap_sem held for read, and page locked.
105 */
106extern void mlock_vma_page(struct page *page);
107
108/*
109 * Clear the page's PageMlocked(). This can be useful in a situation where
110 * we want to unconditionally remove a page from the pagecache -- e.g.,
111 * on truncation or freeing.
112 *
113 * It is legal to call this function for any page, mlocked or not.
114 * If called for a page that is still mapped by mlocked vmas, all we do
115 * is revert to lazy LRU behaviour -- semantics are not broken.
116 */
117extern void __clear_page_mlock(struct page *page);
118static inline void clear_page_mlock(struct page *page)
119{
120 if (unlikely(TestClearPageMlocked(page)))
121 __clear_page_mlock(page);
122}
123
124/*
125 * mlock_migrate_page - called only from migrate_page_copy() to
126 * migrate the Mlocked page flag
127 */
128static inline void mlock_migrate_page(struct page *newpage, struct page *page)
129{
130 if (TestClearPageMlocked(page))
131 SetPageMlocked(newpage);
132}
133
134
135#else /* CONFIG_UNEVICTABLE_LRU */
136static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
137{
138 return 0;
139}
140static inline void clear_page_mlock(struct page *page) { }
141static inline void mlock_vma_page(struct page *page) { }
142static inline void mlock_migrate_page(struct page *new, struct page *old) { }
143
144#endif /* CONFIG_UNEVICTABLE_LRU */
82 145
83/* 146/*
84 * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, 147 * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
@@ -148,4 +211,12 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
148} 211}
149#endif /* CONFIG_SPARSEMEM */ 212#endif /* CONFIG_SPARSEMEM */
150 213
214#define GUP_FLAGS_WRITE 0x1
215#define GUP_FLAGS_FORCE 0x2
216#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
217
218int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
219 unsigned long start, int len, int flags,
220 struct page **pages, struct vm_area_struct **vmas);
221
151#endif 222#endif
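
__get_user_pages() is the mm-internal form of get_user_pages(); the public wrappers added to mm/memory.c and mm/nommu.c below simply translate their write/force arguments into GUP_FLAGS_WRITE/GUP_FLAGS_FORCE. A sketch of an mm-internal caller that additionally needs GUP_FLAGS_IGNORE_VMA_PERMISSIONS, with a made-up function name and not part of the patch:

	static int example_internal_gup(struct mm_struct *mm, unsigned long start,
					int nr_pages, struct page **pages)
	{
		int flags = GUP_FLAGS_WRITE | GUP_FLAGS_IGNORE_VMA_PERMISSIONS;

		/* returns the number of pages pinned (possibly fewer than
		 * requested), or -EFAULT if none could be pinned */
		return __get_user_pages(current, mm, start, nr_pages, flags,
					pages, NULL);
	}
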
diff --git a/mm/memory.c b/mm/memory.c
index 71cdefd1ef14..9fef7272fb9e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -64,6 +64,8 @@
64 64
65#include "internal.h" 65#include "internal.h"
66 66
67#include "internal.h"
68
67#ifndef CONFIG_NEED_MULTIPLE_NODES 69#ifndef CONFIG_NEED_MULTIPLE_NODES
68/* use the per-pgdat data instead for discontigmem - mbligh */ 70/* use the per-pgdat data instead for discontigmem - mbligh */
69unsigned long max_mapnr; 71unsigned long max_mapnr;
@@ -1129,12 +1131,17 @@ static inline int use_zero_page(struct vm_area_struct *vma)
1129 return !vma->vm_ops || !vma->vm_ops->fault; 1131 return !vma->vm_ops || !vma->vm_ops->fault;
1130} 1132}
1131 1133
1132int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1134
1133 unsigned long start, int len, int write, int force, 1135
1136int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1137 unsigned long start, int len, int flags,
1134 struct page **pages, struct vm_area_struct **vmas) 1138 struct page **pages, struct vm_area_struct **vmas)
1135{ 1139{
1136 int i; 1140 int i;
1137 unsigned int vm_flags; 1141 unsigned int vm_flags = 0;
1142 int write = !!(flags & GUP_FLAGS_WRITE);
1143 int force = !!(flags & GUP_FLAGS_FORCE);
1144 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1138 1145
1139 if (len <= 0) 1146 if (len <= 0)
1140 return 0; 1147 return 0;
@@ -1158,7 +1165,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1158 pud_t *pud; 1165 pud_t *pud;
1159 pmd_t *pmd; 1166 pmd_t *pmd;
1160 pte_t *pte; 1167 pte_t *pte;
1161 if (write) /* user gate pages are read-only */ 1168
1169 /* user gate pages are read-only */
1170 if (!ignore && write)
1162 return i ? : -EFAULT; 1171 return i ? : -EFAULT;
1163 if (pg > TASK_SIZE) 1172 if (pg > TASK_SIZE)
1164 pgd = pgd_offset_k(pg); 1173 pgd = pgd_offset_k(pg);
@@ -1190,8 +1199,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1190 continue; 1199 continue;
1191 } 1200 }
1192 1201
1193 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) 1202 if (!vma ||
1194 || !(vm_flags & vma->vm_flags)) 1203 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1204 (!ignore && !(vm_flags & vma->vm_flags)))
1195 return i ? : -EFAULT; 1205 return i ? : -EFAULT;
1196 1206
1197 if (is_vm_hugetlb_page(vma)) { 1207 if (is_vm_hugetlb_page(vma)) {
@@ -1266,6 +1276,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1266 } while (len); 1276 } while (len);
1267 return i; 1277 return i;
1268} 1278}
1279
1280int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1281 unsigned long start, int len, int write, int force,
1282 struct page **pages, struct vm_area_struct **vmas)
1283{
1284 int flags = 0;
1285
1286 if (write)
1287 flags |= GUP_FLAGS_WRITE;
1288 if (force)
1289 flags |= GUP_FLAGS_FORCE;
1290
1291 return __get_user_pages(tsk, mm,
1292 start, len, flags,
1293 pages, vmas);
1294}
1295
1269EXPORT_SYMBOL(get_user_pages); 1296EXPORT_SYMBOL(get_user_pages);
1270 1297
1271pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1298pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
@@ -1858,6 +1885,15 @@ gotten:
1858 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1885 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1859 if (!new_page) 1886 if (!new_page)
1860 goto oom; 1887 goto oom;
1888 /*
1889 * Don't let another task, with possibly unlocked vma,
1890 * keep the mlocked page.
1891 */
1892 if (vma->vm_flags & VM_LOCKED) {
1893 lock_page(old_page); /* for LRU manipulation */
1894 clear_page_mlock(old_page);
1895 unlock_page(old_page);
1896 }
1861 cow_user_page(new_page, old_page, address, vma); 1897 cow_user_page(new_page, old_page, address, vma);
1862 __SetPageUptodate(new_page); 1898 __SetPageUptodate(new_page);
1863 1899
@@ -2325,7 +2361,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2325 page_add_anon_rmap(page, vma, address); 2361 page_add_anon_rmap(page, vma, address);
2326 2362
2327 swap_free(entry); 2363 swap_free(entry);
2328 if (vm_swap_full()) 2364 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2329 remove_exclusive_swap_page(page); 2365 remove_exclusive_swap_page(page);
2330 unlock_page(page); 2366 unlock_page(page);
2331 2367
@@ -2465,6 +2501,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2465 ret = VM_FAULT_OOM; 2501 ret = VM_FAULT_OOM;
2466 goto out; 2502 goto out;
2467 } 2503 }
2504 /*
2505 * Don't let another task, with possibly unlocked vma,
2506 * keep the mlocked page.
2507 */
2508 if (vma->vm_flags & VM_LOCKED)
2509 clear_page_mlock(vmf.page);
2468 copy_user_highpage(page, vmf.page, address, vma); 2510 copy_user_highpage(page, vmf.page, address, vma);
2469 __SetPageUptodate(page); 2511 __SetPageUptodate(page);
2470 } else { 2512 } else {
diff --git a/mm/migrate.c b/mm/migrate.c
index b10237d8b459..6802a7a3dfec 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -371,6 +371,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
371 __set_page_dirty_nobuffers(newpage); 371 __set_page_dirty_nobuffers(newpage);
372 } 372 }
373 373
374 mlock_migrate_page(newpage, page);
375
374#ifdef CONFIG_SWAP 376#ifdef CONFIG_SWAP
375 ClearPageSwapCache(page); 377 ClearPageSwapCache(page);
376#endif 378#endif
diff --git a/mm/mlock.c b/mm/mlock.c
index 01fbe93eff5c..8746fe3f9730 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -8,10 +8,18 @@
8#include <linux/capability.h> 8#include <linux/capability.h>
9#include <linux/mman.h> 9#include <linux/mman.h>
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/swap.h>
12#include <linux/swapops.h>
13#include <linux/pagemap.h>
11#include <linux/mempolicy.h> 14#include <linux/mempolicy.h>
12#include <linux/syscalls.h> 15#include <linux/syscalls.h>
13#include <linux/sched.h> 16#include <linux/sched.h>
14#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rmap.h>
19#include <linux/mmzone.h>
20#include <linux/hugetlb.h>
21
22#include "internal.h"
15 23
16int can_do_mlock(void) 24int can_do_mlock(void)
17{ 25{
@@ -23,17 +31,360 @@ int can_do_mlock(void)
23} 31}
24EXPORT_SYMBOL(can_do_mlock); 32EXPORT_SYMBOL(can_do_mlock);
25 33
34#ifdef CONFIG_UNEVICTABLE_LRU
35/*
36 * Mlocked pages are marked with PageMlocked() flag for efficient testing
37 * in vmscan and, possibly, the fault path; and to support semi-accurate
38 * statistics.
39 *
40 * An mlocked page [PageMlocked(page)] is unevictable. As such, it will
41 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
42 * The unevictable list is an LRU sibling list to the [in]active lists.
43 * PageUnevictable is set to indicate the unevictable state.
44 *
45 * When lazy mlocking via vmscan, it is important to ensure that the
46 * vma's VM_LOCKED status is not concurrently being modified, otherwise we
47 * may have mlocked a page that is being munlocked. So lazy mlock must take
48 * the mmap_sem for read, and verify that the vma really is locked
49 * (see mm/rmap.c).
50 */
51
52/*
53 * LRU accounting for clear_page_mlock()
54 */
55void __clear_page_mlock(struct page *page)
56{
57 VM_BUG_ON(!PageLocked(page));
58
59 if (!page->mapping) { /* truncated ? */
60 return;
61 }
62
63 if (!isolate_lru_page(page)) {
64 putback_lru_page(page);
65 } else {
66 /*
67 * Page not on the LRU yet. Flush all pagevecs and retry.
68 */
69 lru_add_drain_all();
70 if (!isolate_lru_page(page))
71 putback_lru_page(page);
72 }
73}
74
75/*
76 * Mark page as mlocked if not already.
77 * If page on LRU, isolate and putback to move to unevictable list.
78 */
79void mlock_vma_page(struct page *page)
80{
81 BUG_ON(!PageLocked(page));
82
83 if (!TestSetPageMlocked(page) && !isolate_lru_page(page))
84 putback_lru_page(page);
85}
86
87/*
88 * called from munlock()/munmap() path with page supposedly on the LRU.
89 *
90 * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked
91 * [in try_to_munlock()] and then attempt to isolate the page. We must
92 * isolate the page to keep others from messing with its unevictable
93 * and mlocked state while trying to munlock. However, we pre-clear the
94 * mlocked state anyway as we might lose the isolation race and we might
95 * not get another chance to clear PageMlocked. If we successfully
96 * isolate the page and try_to_munlock() detects other VM_LOCKED vmas
97 * mapping the page, it will restore the PageMlocked state, unless the page
98 * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(),
99 * perhaps redundantly.
100 * If we lose the isolation race, and the page is mapped by other VM_LOCKED
101 * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
102 * either of which will restore the PageMlocked state by calling
103 * mlock_vma_page() above, if it can grab the vma's mmap sem.
104 */
105static void munlock_vma_page(struct page *page)
106{
107 BUG_ON(!PageLocked(page));
108
109 if (TestClearPageMlocked(page) && !isolate_lru_page(page)) {
110 try_to_munlock(page);
111 putback_lru_page(page);
112 }
113}
114
115/*
116 * mlock a range of pages in the vma.
117 *
118 * This takes care of making the pages present too.
119 *
120 * vma->vm_mm->mmap_sem must be held for write.
121 */
122static int __mlock_vma_pages_range(struct vm_area_struct *vma,
123 unsigned long start, unsigned long end)
124{
125 struct mm_struct *mm = vma->vm_mm;
126 unsigned long addr = start;
127 struct page *pages[16]; /* 16 gives a reasonable batch */
128 int write = !!(vma->vm_flags & VM_WRITE);
129 int nr_pages = (end - start) / PAGE_SIZE;
130 int ret;
131
132 VM_BUG_ON(start & ~PAGE_MASK || end & ~PAGE_MASK);
133 VM_BUG_ON(start < vma->vm_start || end > vma->vm_end);
134 VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
135
136 lru_add_drain_all(); /* push cached pages to LRU */
137
138 while (nr_pages > 0) {
139 int i;
140
141 cond_resched();
142
143 /*
144 * get_user_pages makes pages present if we are
145 * setting mlock. and this extra reference count will
146 * disable migration of this page. However, page may
147 * still be truncated out from under us.
148 */
149 ret = get_user_pages(current, mm, addr,
150 min_t(int, nr_pages, ARRAY_SIZE(pages)),
151 write, 0, pages, NULL);
152 /*
153 * This can happen for, e.g., VM_NONLINEAR regions before
154 * a page has been allocated and mapped at a given offset,
155 * or for addresses that map beyond end of a file.
156 * We'll mlock the pages if/when they get faulted in.
157 */
158 if (ret < 0)
159 break;
160 if (ret == 0) {
161 /*
162 * We know the vma is there, so the only time
163 * we cannot get a single page should be an
164 * error (ret < 0) case.
165 */
166 WARN_ON(1);
167 break;
168 }
169
170 lru_add_drain(); /* push cached pages to LRU */
171
172 for (i = 0; i < ret; i++) {
173 struct page *page = pages[i];
174
175 lock_page(page);
176 /*
177 * Because we lock page here and migration is blocked
178 * by the elevated reference, we need only check for
179 * page truncation (file-cache only).
180 */
181 if (page->mapping)
182 mlock_vma_page(page);
183 unlock_page(page);
184 put_page(page); /* ref from get_user_pages() */
185
186 /*
187 * here we assume that get_user_pages() has given us
188 * a list of virtually contiguous pages.
189 */
190 addr += PAGE_SIZE; /* for next get_user_pages() */
191 nr_pages--;
192 }
193 }
194
195 lru_add_drain_all(); /* to update stats */
196
197 return 0; /* count entire vma as locked_vm */
198}
199
200/*
201 * private structure for munlock page table walk
202 */
203struct munlock_page_walk {
204 struct vm_area_struct *vma;
205 pmd_t *pmd; /* for migration_entry_wait() */
206};
207
208/*
209 * munlock normal pages for present ptes
210 */
211static int __munlock_pte_handler(pte_t *ptep, unsigned long addr,
212 unsigned long end, struct mm_walk *walk)
213{
214 struct munlock_page_walk *mpw = walk->private;
215 swp_entry_t entry;
216 struct page *page;
217 pte_t pte;
218
219retry:
220 pte = *ptep;
221 /*
222 * If it's a swap pte, we might be racing with page migration.
223 */
224 if (unlikely(!pte_present(pte))) {
225 if (!is_swap_pte(pte))
226 goto out;
227 entry = pte_to_swp_entry(pte);
228 if (is_migration_entry(entry)) {
229 migration_entry_wait(mpw->vma->vm_mm, mpw->pmd, addr);
230 goto retry;
231 }
232 goto out;
233 }
234
235 page = vm_normal_page(mpw->vma, addr, pte);
236 if (!page)
237 goto out;
238
239 lock_page(page);
240 if (!page->mapping) {
241 unlock_page(page);
242 goto retry;
243 }
244 munlock_vma_page(page);
245 unlock_page(page);
246
247out:
248 return 0;
249}
250
251/*
252 * Save pmd for pte handler for waiting on migration entries
253 */
254static int __munlock_pmd_handler(pmd_t *pmd, unsigned long addr,
255 unsigned long end, struct mm_walk *walk)
256{
257 struct munlock_page_walk *mpw = walk->private;
258
259 mpw->pmd = pmd;
260 return 0;
261}
262
263
264/*
265 * munlock a range of pages in the vma using standard page table walk.
266 *
267 * vma->vm_mm->mmap_sem must be held for write.
268 */
269static void __munlock_vma_pages_range(struct vm_area_struct *vma,
270 unsigned long start, unsigned long end)
271{
272 struct mm_struct *mm = vma->vm_mm;
273 struct munlock_page_walk mpw = {
274 .vma = vma,
275 };
276 struct mm_walk munlock_page_walk = {
277 .pmd_entry = __munlock_pmd_handler,
278 .pte_entry = __munlock_pte_handler,
279 .private = &mpw,
280 .mm = mm,
281 };
282
283 VM_BUG_ON(start & ~PAGE_MASK || end & ~PAGE_MASK);
284 VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
285 VM_BUG_ON(start < vma->vm_start);
286 VM_BUG_ON(end > vma->vm_end);
287
288 lru_add_drain_all(); /* push cached pages to LRU */
289 walk_page_range(start, end, &munlock_page_walk);
290 lru_add_drain_all(); /* to update stats */
291}
292
293#else /* CONFIG_UNEVICTABLE_LRU */
294
295/*
296 * Just make pages present if VM_LOCKED. No-op if unlocking.
297 */
298static int __mlock_vma_pages_range(struct vm_area_struct *vma,
299 unsigned long start, unsigned long end)
300{
301 if (vma->vm_flags & VM_LOCKED)
302 make_pages_present(start, end);
303 return 0;
304}
305
306/*
307 * munlock a range of pages in the vma -- no-op.
308 */
309static void __munlock_vma_pages_range(struct vm_area_struct *vma,
310 unsigned long start, unsigned long end)
311{
312}
313#endif /* CONFIG_UNEVICTABLE_LRU */
314
315/*
316 * mlock all pages in this vma range. For mmap()/mremap()/...
317 */
318int mlock_vma_pages_range(struct vm_area_struct *vma,
319 unsigned long start, unsigned long end)
320{
321 int nr_pages = (end - start) / PAGE_SIZE;
322 BUG_ON(!(vma->vm_flags & VM_LOCKED));
323
324 /*
325 * filter unlockable vmas
326 */
327 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
328 goto no_mlock;
329
330 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
331 is_vm_hugetlb_page(vma) ||
332 vma == get_gate_vma(current)))
333 return __mlock_vma_pages_range(vma, start, end);
334
335 /*
336 * User mapped kernel pages or huge pages:
337 * make these pages present to populate the ptes, but
338 * fall thru' to reset VM_LOCKED--no need to unlock, and
339 * return nr_pages so these don't get counted against task's
340 * locked limit. huge pages are already counted against
341 * locked vm limit.
342 */
343 make_pages_present(start, end);
344
345no_mlock:
346 vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */
347 return nr_pages; /* pages NOT mlocked */
348}
349
350
351/*
352 * munlock all pages in vma. For munmap() and exit().
353 */
354void munlock_vma_pages_all(struct vm_area_struct *vma)
355{
356 vma->vm_flags &= ~VM_LOCKED;
357 __munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
358}
359
360/*
361 * mlock_fixup - handle mlock[all]/munlock[all] requests.
362 *
363 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
364 * munlock is a no-op. However, for some special vmas, we go ahead and
365 * populate the ptes via make_pages_present().
366 *
367 * For vmas that pass the filters, merge/split as appropriate.
368 */
26static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, 369static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
27 unsigned long start, unsigned long end, unsigned int newflags) 370 unsigned long start, unsigned long end, unsigned int newflags)
28{ 371{
29 struct mm_struct * mm = vma->vm_mm; 372 struct mm_struct *mm = vma->vm_mm;
30 pgoff_t pgoff; 373 pgoff_t pgoff;
31 int pages; 374 int nr_pages;
32 int ret = 0; 375 int ret = 0;
33 376 int lock = newflags & VM_LOCKED;
34 if (newflags == vma->vm_flags) { 377
35 *prev = vma; 378 if (newflags == vma->vm_flags ||
36 goto out; 379 (vma->vm_flags & (VM_IO | VM_PFNMAP)))
380 goto out; /* don't set VM_LOCKED, don't count */
381
382 if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
383 is_vm_hugetlb_page(vma) ||
384 vma == get_gate_vma(current)) {
385 if (lock)
386 make_pages_present(start, end);
387 goto out; /* don't set VM_LOCKED, don't count */
37 } 388 }
38 389
39 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 390 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -44,8 +395,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
44 goto success; 395 goto success;
45 } 396 }
46 397
47 *prev = vma;
48
49 if (start != vma->vm_start) { 398 if (start != vma->vm_start) {
50 ret = split_vma(mm, vma, start, 1); 399 ret = split_vma(mm, vma, start, 1);
51 if (ret) 400 if (ret)
@@ -60,24 +409,31 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
60 409
61success: 410success:
62 /* 411 /*
412 * Keep track of amount of locked VM.
413 */
414 nr_pages = (end - start) >> PAGE_SHIFT;
415 if (!lock)
416 nr_pages = -nr_pages;
417 mm->locked_vm += nr_pages;
418
419 /*
63 * vm_flags is protected by the mmap_sem held in write mode. 420 * vm_flags is protected by the mmap_sem held in write mode.
64 * It's okay if try_to_unmap_one unmaps a page just after we 421 * It's okay if try_to_unmap_one unmaps a page just after we
65 * set VM_LOCKED, make_pages_present below will bring it back. 422 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
66 */ 423 */
67 vma->vm_flags = newflags; 424 vma->vm_flags = newflags;
68 425
69 /* 426 if (lock) {
70 * Keep track of amount of locked VM. 427 ret = __mlock_vma_pages_range(vma, start, end);
71 */ 428 if (ret > 0) {
72 pages = (end - start) >> PAGE_SHIFT; 429 mm->locked_vm -= ret;
73 if (newflags & VM_LOCKED) { 430 ret = 0;
74 pages = -pages; 431 }
75 if (!(newflags & VM_IO)) 432 } else
76 ret = make_pages_present(start, end); 433 __munlock_vma_pages_range(vma, start, end);
77 }
78 434
79 mm->locked_vm -= pages;
80out: 435out:
436 *prev = vma;
81 return ret; 437 return ret;
82} 438}
83 439
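
mlock_vma_pages_range() returns 0 when the whole range should be charged as locked_vm, and nr_pages when the vma is special/huge/gate and should not be charged. One plausible way a caller (e.g. the mmap path wired up later in the series) could consume that contract; the function name is hypothetical and this is not part of the patch:

	static void example_charge_locked_vm(struct vm_area_struct *vma)
	{
		struct mm_struct *mm = vma->vm_mm;
		long nr_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
		long not_mlocked;

		/* vma->vm_flags must already have VM_LOCKED set */
		not_mlocked = mlock_vma_pages_range(vma, vma->vm_start,
						    vma->vm_end);

		/* charge only what was actually treated as mlocked */
		mm->locked_vm += nr_pages - not_mlocked;
	}
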
diff --git a/mm/mmap.c b/mm/mmap.c
index e7a5a68a9c2e..7bdfd2661f17 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -662,8 +662,6 @@ again: remove_next = 1 + (end > next->vm_end);
662 * If the vma has a ->close operation then the driver probably needs to release 662 * If the vma has a ->close operation then the driver probably needs to release
663 * per-vma resources, so we don't attempt to merge those. 663 * per-vma resources, so we don't attempt to merge those.
664 */ 664 */
665#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
666
667static inline int is_mergeable_vma(struct vm_area_struct *vma, 665static inline int is_mergeable_vma(struct vm_area_struct *vma,
668 struct file *file, unsigned long vm_flags) 666 struct file *file, unsigned long vm_flags)
669{ 667{
diff --git a/mm/nommu.c b/mm/nommu.c
index ed75bc962fbe..2696b24f2bb3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -34,6 +34,8 @@
34#include <asm/tlb.h> 34#include <asm/tlb.h>
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36 36
37#include "internal.h"
38
37void *high_memory; 39void *high_memory;
38struct page *mem_map; 40struct page *mem_map;
39unsigned long max_mapnr; 41unsigned long max_mapnr;
@@ -128,20 +130,16 @@ unsigned int kobjsize(const void *objp)
128 return PAGE_SIZE << compound_order(page); 130 return PAGE_SIZE << compound_order(page);
129} 131}
130 132
131/* 133int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
132 * get a list of pages in an address range belonging to the specified process 134 unsigned long start, int len, int flags,
133 * and indicate the VMA that covers each page 135 struct page **pages, struct vm_area_struct **vmas)
134 * - this is potentially dodgy as we may end incrementing the page count of a
135 * slab page or a secondary page from a compound page
136 * - don't permit access to VMAs that don't support it, such as I/O mappings
137 */
138int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
139 unsigned long start, int len, int write, int force,
140 struct page **pages, struct vm_area_struct **vmas)
141{ 136{
142 struct vm_area_struct *vma; 137 struct vm_area_struct *vma;
143 unsigned long vm_flags; 138 unsigned long vm_flags;
144 int i; 139 int i;
140 int write = !!(flags & GUP_FLAGS_WRITE);
141 int force = !!(flags & GUP_FLAGS_FORCE);
142 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
145 143
146 /* calculate required read or write permissions. 144 /* calculate required read or write permissions.
147 * - if 'force' is set, we only require the "MAY" flags. 145 * - if 'force' is set, we only require the "MAY" flags.
@@ -156,7 +154,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
156 154
157 /* protect what we can, including chardevs */ 155 /* protect what we can, including chardevs */
158 if (vma->vm_flags & (VM_IO | VM_PFNMAP) || 156 if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
159 !(vm_flags & vma->vm_flags)) 157 (!ignore && !(vm_flags & vma->vm_flags)))
160 goto finish_or_fault; 158 goto finish_or_fault;
161 159
162 if (pages) { 160 if (pages) {
@@ -174,6 +172,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
174finish_or_fault: 172finish_or_fault:
175 return i ? : -EFAULT; 173 return i ? : -EFAULT;
176} 174}
175
176
177/*
178 * get a list of pages in an address range belonging to the specified process
179 * and indicate the VMA that covers each page
180 * - this is potentially dodgy as we may end incrementing the page count of a
181 * slab page or a secondary page from a compound page
182 * - don't permit access to VMAs that don't support it, such as I/O mappings
183 */
184int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
185 unsigned long start, int len, int write, int force,
186 struct page **pages, struct vm_area_struct **vmas)
187{
188 int flags = 0;
189
190 if (write)
191 flags |= GUP_FLAGS_WRITE;
192 if (force)
193 flags |= GUP_FLAGS_FORCE;
194
195 return __get_user_pages(tsk, mm,
196 start, len, flags,
197 pages, vmas);
198}
177EXPORT_SYMBOL(get_user_pages); 199EXPORT_SYMBOL(get_user_pages);
178 200
179DEFINE_RWLOCK(vmlist_lock); 201DEFINE_RWLOCK(vmlist_lock);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4125230a1b2c..5886586fde6c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -616,7 +616,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
616 616
617 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | 617 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
618 1 << PG_referenced | 1 << PG_arch_1 | 618 1 << PG_referenced | 1 << PG_arch_1 |
619 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); 619 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
620#ifdef CONFIG_UNEVICTABLE_LRU
621 | 1 << PG_mlocked
622#endif
623 );
620 set_page_private(page, 0); 624 set_page_private(page, 0);
621 set_page_refcounted(page); 625 set_page_refcounted(page);
622 626
diff --git a/mm/rmap.c b/mm/rmap.c
index e8d639b16c6d..7e60df99018e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -53,6 +53,8 @@
53 53
54#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
55 55
56#include "internal.h"
57
56struct kmem_cache *anon_vma_cachep; 58struct kmem_cache *anon_vma_cachep;
57 59
58/** 60/**
@@ -290,6 +292,32 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
290 return NULL; 292 return NULL;
291} 293}
292 294
295/**
296 * page_mapped_in_vma - check whether a page is really mapped in a VMA
297 * @page: the page to test
298 * @vma: the VMA to test
299 *
300 * Returns 1 if the page is mapped into the page tables of the VMA, 0
301 * if the page is not mapped into the page tables of this VMA. Only
302 * valid for normal file or anonymous VMAs.
303 */
304static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
305{
306 unsigned long address;
307 pte_t *pte;
308 spinlock_t *ptl;
309
310 address = vma_address(page, vma);
311 if (address == -EFAULT) /* out of vma range */
312 return 0;
313 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
314 if (!pte) /* the page is not in this mm */
315 return 0;
316 pte_unmap_unlock(pte, ptl);
317
318 return 1;
319}
320
293/* 321/*
294 * Subfunctions of page_referenced: page_referenced_one called 322 * Subfunctions of page_referenced: page_referenced_one called
295 * repeatedly from either page_referenced_anon or page_referenced_file. 323 * repeatedly from either page_referenced_anon or page_referenced_file.
@@ -311,10 +339,17 @@ static int page_referenced_one(struct page *page,
311 if (!pte) 339 if (!pte)
312 goto out; 340 goto out;
313 341
342 /*
343 * Don't want to elevate referenced for mlocked page that gets this far,
344 * in order that it progresses to try_to_unmap and is moved to the
345 * unevictable list.
346 */
314 if (vma->vm_flags & VM_LOCKED) { 347 if (vma->vm_flags & VM_LOCKED) {
315 referenced++;
316 *mapcount = 1; /* break early from loop */ 348 *mapcount = 1; /* break early from loop */
317 } else if (ptep_clear_flush_young_notify(vma, address, pte)) 349 goto out_unmap;
350 }
351
352 if (ptep_clear_flush_young_notify(vma, address, pte))
318 referenced++; 353 referenced++;
319 354
320 /* Pretend the page is referenced if the task has the 355 /* Pretend the page is referenced if the task has the
@@ -323,6 +358,7 @@ static int page_referenced_one(struct page *page,
323 rwsem_is_locked(&mm->mmap_sem)) 358 rwsem_is_locked(&mm->mmap_sem))
324 referenced++; 359 referenced++;
325 360
361out_unmap:
326 (*mapcount)--; 362 (*mapcount)--;
327 pte_unmap_unlock(pte, ptl); 363 pte_unmap_unlock(pte, ptl);
328out: 364out:
@@ -412,11 +448,6 @@ static int page_referenced_file(struct page *page,
412 */ 448 */
413 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 449 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
414 continue; 450 continue;
415 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
416 == (VM_LOCKED|VM_MAYSHARE)) {
417 referenced++;
418 break;
419 }
420 referenced += page_referenced_one(page, vma, &mapcount); 451 referenced += page_referenced_one(page, vma, &mapcount);
421 if (!mapcount) 452 if (!mapcount)
422 break; 453 break;
@@ -739,11 +770,16 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
739 * If it's recently referenced (perhaps page_referenced 770 * If it's recently referenced (perhaps page_referenced
740 * skipped over this mm) then we should reactivate it. 771 * skipped over this mm) then we should reactivate it.
741 */ 772 */
742 if (!migration && ((vma->vm_flags & VM_LOCKED) || 773 if (!migration) {
743 (ptep_clear_flush_young_notify(vma, address, pte)))) { 774 if (vma->vm_flags & VM_LOCKED) {
744 ret = SWAP_FAIL; 775 ret = SWAP_MLOCK;
745 goto out_unmap; 776 goto out_unmap;
746 } 777 }
778 if (ptep_clear_flush_young_notify(vma, address, pte)) {
779 ret = SWAP_FAIL;
780 goto out_unmap;
781 }
782 }
747 783
748 /* Nuke the page table entry. */ 784 /* Nuke the page table entry. */
749 flush_cache_page(vma, address, page_to_pfn(page)); 785 flush_cache_page(vma, address, page_to_pfn(page));
@@ -824,12 +860,17 @@ out:
824 * For very sparsely populated VMAs this is a little inefficient - chances are 860 * For very sparsely populated VMAs this is a little inefficient - chances are
825 * there there won't be many ptes located within the scan cluster. In this case 861 * there there won't be many ptes located within the scan cluster. In this case
826 * maybe we could scan further - to the end of the pte page, perhaps. 862 * maybe we could scan further - to the end of the pte page, perhaps.
863 *
864 * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
865 * acquire it without blocking. If vma locked, mlock the pages in the cluster,
866 * rather than unmapping them. If we encounter the "check_page" that vmscan is
867 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
827 */ 868 */
828#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) 869#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
829#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) 870#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
830 871
831static void try_to_unmap_cluster(unsigned long cursor, 872static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
832 unsigned int *mapcount, struct vm_area_struct *vma) 873 struct vm_area_struct *vma, struct page *check_page)
833{ 874{
834 struct mm_struct *mm = vma->vm_mm; 875 struct mm_struct *mm = vma->vm_mm;
835 pgd_t *pgd; 876 pgd_t *pgd;
@@ -841,6 +882,8 @@ static void try_to_unmap_cluster(unsigned long cursor,
841 struct page *page; 882 struct page *page;
842 unsigned long address; 883 unsigned long address;
843 unsigned long end; 884 unsigned long end;
885 int ret = SWAP_AGAIN;
886 int locked_vma = 0;
844 887
845 address = (vma->vm_start + cursor) & CLUSTER_MASK; 888 address = (vma->vm_start + cursor) & CLUSTER_MASK;
846 end = address + CLUSTER_SIZE; 889 end = address + CLUSTER_SIZE;
@@ -851,15 +894,26 @@ static void try_to_unmap_cluster(unsigned long cursor,
851 894
852 pgd = pgd_offset(mm, address); 895 pgd = pgd_offset(mm, address);
853 if (!pgd_present(*pgd)) 896 if (!pgd_present(*pgd))
854 return; 897 return ret;
855 898
856 pud = pud_offset(pgd, address); 899 pud = pud_offset(pgd, address);
857 if (!pud_present(*pud)) 900 if (!pud_present(*pud))
858 return; 901 return ret;
859 902
860 pmd = pmd_offset(pud, address); 903 pmd = pmd_offset(pud, address);
861 if (!pmd_present(*pmd)) 904 if (!pmd_present(*pmd))
862 return; 905 return ret;
906
907 /*
908 * MLOCK_PAGES => feature is configured.
909 * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
910 * keep the sem while scanning the cluster for mlocking pages.
911 */
912 if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) {
913 locked_vma = (vma->vm_flags & VM_LOCKED);
914 if (!locked_vma)
915 up_read(&vma->vm_mm->mmap_sem); /* don't need it */
916 }
863 917
864 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 918 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
865 919
@@ -872,6 +926,13 @@ static void try_to_unmap_cluster(unsigned long cursor,
872 page = vm_normal_page(vma, address, *pte); 926 page = vm_normal_page(vma, address, *pte);
873 BUG_ON(!page || PageAnon(page)); 927 BUG_ON(!page || PageAnon(page));
874 928
929 if (locked_vma) {
930 mlock_vma_page(page); /* no-op if already mlocked */
931 if (page == check_page)
932 ret = SWAP_MLOCK;
933 continue; /* don't unmap */
934 }
935
875 if (ptep_clear_flush_young_notify(vma, address, pte)) 936 if (ptep_clear_flush_young_notify(vma, address, pte))
876 continue; 937 continue;
877 938
@@ -893,39 +954,104 @@ static void try_to_unmap_cluster(unsigned long cursor,
893 (*mapcount)--; 954 (*mapcount)--;
894 } 955 }
895 pte_unmap_unlock(pte - 1, ptl); 956 pte_unmap_unlock(pte - 1, ptl);
957 if (locked_vma)
958 up_read(&vma->vm_mm->mmap_sem);
959 return ret;
896} 960}
897 961
898static int try_to_unmap_anon(struct page *page, int migration) 962/*
963 * common handling for pages mapped in VM_LOCKED vmas
964 */
965static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
966{
967 int mlocked = 0;
968
969 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
970 if (vma->vm_flags & VM_LOCKED) {
971 mlock_vma_page(page);
972 mlocked++; /* really mlocked the page */
973 }
974 up_read(&vma->vm_mm->mmap_sem);
975 }
976 return mlocked;
977}
978
979/**
980 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
981 * rmap method
982 * @page: the page to unmap/unlock
983 * @unlock: request for unlock rather than unmap [unlikely]
984 * @migration: unmapping for migration - ignored if @unlock
985 *
986 * Find all the mappings of a page using the mapping pointer and the vma chains
987 * contained in the anon_vma struct it points to.
988 *
989 * This function is only called from try_to_unmap/try_to_munlock for
990 * anonymous pages.
991 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
992 * where the page was found will be held for write. So, we won't recheck
993 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
994 * 'LOCKED.
995 */
996static int try_to_unmap_anon(struct page *page, int unlock, int migration)
899{ 997{
900 struct anon_vma *anon_vma; 998 struct anon_vma *anon_vma;
901 struct vm_area_struct *vma; 999 struct vm_area_struct *vma;
1000 unsigned int mlocked = 0;
902 int ret = SWAP_AGAIN; 1001 int ret = SWAP_AGAIN;
903 1002
1003 if (MLOCK_PAGES && unlikely(unlock))
1004 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
1005
904 anon_vma = page_lock_anon_vma(page); 1006 anon_vma = page_lock_anon_vma(page);
905 if (!anon_vma) 1007 if (!anon_vma)
906 return ret; 1008 return ret;
907 1009
908 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1010 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
909 ret = try_to_unmap_one(page, vma, migration); 1011 if (MLOCK_PAGES && unlikely(unlock)) {
910 if (ret == SWAP_FAIL || !page_mapped(page)) 1012 if (!((vma->vm_flags & VM_LOCKED) &&
911 break; 1013 page_mapped_in_vma(page, vma)))
1014 continue; /* must visit all unlocked vmas */
1015 ret = SWAP_MLOCK; /* saw at least one mlocked vma */
1016 } else {
1017 ret = try_to_unmap_one(page, vma, migration);
1018 if (ret == SWAP_FAIL || !page_mapped(page))
1019 break;
1020 }
1021 if (ret == SWAP_MLOCK) {
1022 mlocked = try_to_mlock_page(page, vma);
1023 if (mlocked)
1024 break; /* stop if actually mlocked page */
1025 }
912 } 1026 }
913 1027
914 page_unlock_anon_vma(anon_vma); 1028 page_unlock_anon_vma(anon_vma);
1029
1030 if (mlocked)
1031 ret = SWAP_MLOCK; /* actually mlocked the page */
1032 else if (ret == SWAP_MLOCK)
1033 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
1034
915 return ret; 1035 return ret;
916} 1036}
917 1037
918/** 1038/**
919 * try_to_unmap_file - unmap file page using the object-based rmap method 1039 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
920 * @page: the page to unmap 1040 * @page: the page to unmap/unlock
921 * @migration: migration flag 1041 * @unlock: request for unlock rather than unmap [unlikely]
1042 * @migration: unmapping for migration - ignored if @unlock
922 * 1043 *
923 * Find all the mappings of a page using the mapping pointer and the vma chains 1044 * Find all the mappings of a page using the mapping pointer and the vma chains
924 * contained in the address_space struct it points to. 1045 * contained in the address_space struct it points to.
925 * 1046 *
926 * This function is only called from try_to_unmap for object-based pages. 1047 * This function is only called from try_to_unmap/try_to_munlock for
1048 * object-based pages.
1049 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1050 * where the page was found will be held for write. So, we won't recheck
1051 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1052 * 'LOCKED.
927 */ 1053 */
928static int try_to_unmap_file(struct page *page, int migration) 1054static int try_to_unmap_file(struct page *page, int unlock, int migration)
929{ 1055{
930 struct address_space *mapping = page->mapping; 1056 struct address_space *mapping = page->mapping;
931 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1057 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -936,20 +1062,44 @@ static int try_to_unmap_file(struct page *page, int migration)
936 unsigned long max_nl_cursor = 0; 1062 unsigned long max_nl_cursor = 0;
937 unsigned long max_nl_size = 0; 1063 unsigned long max_nl_size = 0;
938 unsigned int mapcount; 1064 unsigned int mapcount;
1065 unsigned int mlocked = 0;
1066
1067 if (MLOCK_PAGES && unlikely(unlock))
1068 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
939 1069
940 spin_lock(&mapping->i_mmap_lock); 1070 spin_lock(&mapping->i_mmap_lock);
941 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1071 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
942 ret = try_to_unmap_one(page, vma, migration); 1072 if (MLOCK_PAGES && unlikely(unlock)) {
943 if (ret == SWAP_FAIL || !page_mapped(page)) 1073 if (!(vma->vm_flags & VM_LOCKED))
944 goto out; 1074 continue; /* must visit all vmas */
1075 ret = SWAP_MLOCK;
1076 } else {
1077 ret = try_to_unmap_one(page, vma, migration);
1078 if (ret == SWAP_FAIL || !page_mapped(page))
1079 goto out;
1080 }
1081 if (ret == SWAP_MLOCK) {
1082 mlocked = try_to_mlock_page(page, vma);
1083 if (mlocked)
1084 break; /* stop if actually mlocked page */
1085 }
945 } 1086 }
946 1087
1088 if (mlocked)
1089 goto out;
1090
947 if (list_empty(&mapping->i_mmap_nonlinear)) 1091 if (list_empty(&mapping->i_mmap_nonlinear))
948 goto out; 1092 goto out;
949 1093
950 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1094 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
951 shared.vm_set.list) { 1095 shared.vm_set.list) {
952 if ((vma->vm_flags & VM_LOCKED) && !migration) 1096 if (MLOCK_PAGES && unlikely(unlock)) {
1097 if (!(vma->vm_flags & VM_LOCKED))
1098 continue; /* must visit all vmas */
1099 ret = SWAP_MLOCK; /* leave mlocked == 0 */
1100 goto out; /* no need to look further */
1101 }
1102 if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
953 continue; 1103 continue;
954 cursor = (unsigned long) vma->vm_private_data; 1104 cursor = (unsigned long) vma->vm_private_data;
955 if (cursor > max_nl_cursor) 1105 if (cursor > max_nl_cursor)
@@ -959,7 +1109,7 @@ static int try_to_unmap_file(struct page *page, int migration)
959 max_nl_size = cursor; 1109 max_nl_size = cursor;
960 } 1110 }
961 1111
962 if (max_nl_size == 0) { /* any nonlinears locked or reserved */ 1112 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
963 ret = SWAP_FAIL; 1113 ret = SWAP_FAIL;
964 goto out; 1114 goto out;
965 } 1115 }
@@ -983,12 +1133,16 @@ static int try_to_unmap_file(struct page *page, int migration)
983 do { 1133 do {
984 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1134 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
985 shared.vm_set.list) { 1135 shared.vm_set.list) {
986 if ((vma->vm_flags & VM_LOCKED) && !migration) 1136 if (!MLOCK_PAGES && !migration &&
1137 (vma->vm_flags & VM_LOCKED))
987 continue; 1138 continue;
988 cursor = (unsigned long) vma->vm_private_data; 1139 cursor = (unsigned long) vma->vm_private_data;
989 while ( cursor < max_nl_cursor && 1140 while ( cursor < max_nl_cursor &&
990 cursor < vma->vm_end - vma->vm_start) { 1141 cursor < vma->vm_end - vma->vm_start) {
991 try_to_unmap_cluster(cursor, &mapcount, vma); 1142 ret = try_to_unmap_cluster(cursor, &mapcount,
1143 vma, page);
1144 if (ret == SWAP_MLOCK)
1145 mlocked = 2; /* to return below */
992 cursor += CLUSTER_SIZE; 1146 cursor += CLUSTER_SIZE;
993 vma->vm_private_data = (void *) cursor; 1147 vma->vm_private_data = (void *) cursor;
994 if ((int)mapcount <= 0) 1148 if ((int)mapcount <= 0)
@@ -1009,6 +1163,10 @@ static int try_to_unmap_file(struct page *page, int migration)
1009 vma->vm_private_data = NULL; 1163 vma->vm_private_data = NULL;
1010out: 1164out:
1011 spin_unlock(&mapping->i_mmap_lock); 1165 spin_unlock(&mapping->i_mmap_lock);
1166 if (mlocked)
1167 ret = SWAP_MLOCK; /* actually mlocked the page */
1168 else if (ret == SWAP_MLOCK)
1169 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
1012 return ret; 1170 return ret;
1013} 1171}
1014 1172
@@ -1024,6 +1182,7 @@ out:
1024 * SWAP_SUCCESS - we succeeded in removing all mappings 1182 * SWAP_SUCCESS - we succeeded in removing all mappings
1025 * SWAP_AGAIN - we missed a mapping, try again later 1183 * SWAP_AGAIN - we missed a mapping, try again later
1026 * SWAP_FAIL - the page is unswappable 1184 * SWAP_FAIL - the page is unswappable
1185 * SWAP_MLOCK - page is mlocked.
1027 */ 1186 */
1028int try_to_unmap(struct page *page, int migration) 1187int try_to_unmap(struct page *page, int migration)
1029{ 1188{
@@ -1032,12 +1191,36 @@ int try_to_unmap(struct page *page, int migration)
1032 BUG_ON(!PageLocked(page)); 1191 BUG_ON(!PageLocked(page));
1033 1192
1034 if (PageAnon(page)) 1193 if (PageAnon(page))
1035 ret = try_to_unmap_anon(page, migration); 1194 ret = try_to_unmap_anon(page, 0, migration);
1036 else 1195 else
1037 ret = try_to_unmap_file(page, migration); 1196 ret = try_to_unmap_file(page, 0, migration);
1038 1197 if (ret != SWAP_MLOCK && !page_mapped(page))
1039 if (!page_mapped(page))
1040 ret = SWAP_SUCCESS; 1198 ret = SWAP_SUCCESS;
1041 return ret; 1199 return ret;
1042} 1200}
1043 1201
1202#ifdef CONFIG_UNEVICTABLE_LRU
1203/**
1204 * try_to_munlock - try to munlock a page
1205 * @page: the page to be munlocked
1206 *
1207 * Called from munlock code. Checks all of the VMAs mapping the page
1208 * to make sure nobody else has this page mlocked. The page will be
1209 * returned with PG_mlocked cleared if no other vmas have it mlocked.
1210 *
1211 * Return values are:
1212 *
1213 * SWAP_SUCCESS - no vma's holding page mlocked.
1214 * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
1215 * SWAP_MLOCK - page is now mlocked.
1216 */
1217int try_to_munlock(struct page *page)
1218{
1219 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1220
1221 if (PageAnon(page))
1222 return try_to_unmap_anon(page, 1, 0);
1223 else
1224 return try_to_unmap_file(page, 1, 0);
1225}
1226#endif
diff --git a/mm/swap.c b/mm/swap.c
index fee6b973f143..bc58c1369dd6 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -278,7 +278,7 @@ void lru_add_drain(void)
278 put_cpu(); 278 put_cpu();
279} 279}
280 280
281#ifdef CONFIG_NUMA 281#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU)
282static void lru_add_drain_per_cpu(struct work_struct *dummy) 282static void lru_add_drain_per_cpu(struct work_struct *dummy)
283{ 283{
284 lru_add_drain(); 284 lru_add_drain();
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dfb342e0db9b..e5aaaad159ef 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -582,11 +582,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
582 582
583 sc->nr_scanned++; 583 sc->nr_scanned++;
584 584
585 if (unlikely(!page_evictable(page, NULL))) { 585 if (unlikely(!page_evictable(page, NULL)))
586 unlock_page(page); 586 goto cull_mlocked;
587 putback_lru_page(page);
588 continue;
589 }
590 587
591 if (!sc->may_swap && page_mapped(page)) 588 if (!sc->may_swap && page_mapped(page))
592 goto keep_locked; 589 goto keep_locked;
@@ -624,9 +621,19 @@ static unsigned long shrink_page_list(struct list_head *page_list,
624 * Anonymous process memory has backing store? 621 * Anonymous process memory has backing store?
625 * Try to allocate it some swap space here. 622 * Try to allocate it some swap space here.
626 */ 623 */
627 if (PageAnon(page) && !PageSwapCache(page)) 624 if (PageAnon(page) && !PageSwapCache(page)) {
625 switch (try_to_munlock(page)) {
626 case SWAP_FAIL: /* shouldn't happen */
627 case SWAP_AGAIN:
628 goto keep_locked;
629 case SWAP_MLOCK:
630 goto cull_mlocked;
631 case SWAP_SUCCESS:
632 ; /* fall thru'; add to swap cache */
633 }
628 if (!add_to_swap(page, GFP_ATOMIC)) 634 if (!add_to_swap(page, GFP_ATOMIC))
629 goto activate_locked; 635 goto activate_locked;
636 }
630#endif /* CONFIG_SWAP */ 637#endif /* CONFIG_SWAP */
631 638
632 mapping = page_mapping(page); 639 mapping = page_mapping(page);
@@ -641,6 +648,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
641 goto activate_locked; 648 goto activate_locked;
642 case SWAP_AGAIN: 649 case SWAP_AGAIN:
643 goto keep_locked; 650 goto keep_locked;
651 case SWAP_MLOCK:
652 goto cull_mlocked;
644 case SWAP_SUCCESS: 653 case SWAP_SUCCESS:
645 ; /* try to free the page below */ 654 ; /* try to free the page below */
646 } 655 }
@@ -731,6 +740,11 @@ free_it:
731 } 740 }
732 continue; 741 continue;
733 742
743cull_mlocked:
744 unlock_page(page);
745 putback_lru_page(page);
746 continue;
747
734activate_locked: 748activate_locked:
735 /* Not a candidate for swapping, so reclaim swap space. */ 749 /* Not a candidate for swapping, so reclaim swap space. */
736 if (PageSwapCache(page) && vm_swap_full()) 750 if (PageSwapCache(page) && vm_swap_full())
@@ -742,7 +756,7 @@ keep_locked:
742 unlock_page(page); 756 unlock_page(page);
743keep: 757keep:
744 list_add(&page->lru, &ret_pages); 758 list_add(&page->lru, &ret_pages);
745 VM_BUG_ON(PageLRU(page)); 759 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
746 } 760 }
747 list_splice(&ret_pages, page_list); 761 list_splice(&ret_pages, page_list);
748 if (pagevec_count(&freed_pvec)) 762 if (pagevec_count(&freed_pvec))
@@ -2329,12 +2343,13 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2329 * @vma: the VMA in which the page is or will be mapped, may be NULL 2343 * @vma: the VMA in which the page is or will be mapped, may be NULL
2330 * 2344 *
2331 * Test whether page is evictable--i.e., should be placed on active/inactive 2345 * Test whether page is evictable--i.e., should be placed on active/inactive
2332 * lists vs unevictable list. 2346 * lists vs unevictable list. The vma argument is !NULL when called from the
2347 * fault path to determine how to instantiate a new page.
2333 * 2348 *
2334 * Reasons page might not be evictable: 2349 * Reasons page might not be evictable:
2335 * (1) page's mapping marked unevictable 2350 * (1) page's mapping marked unevictable
2351 * (2) page is part of an mlocked VMA
2336 * 2352 *
2337 * TODO - later patches
2338 */ 2353 */
2339int page_evictable(struct page *page, struct vm_area_struct *vma) 2354int page_evictable(struct page *page, struct vm_area_struct *vma)
2340{ 2355{
@@ -2342,7 +2357,8 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
2342 if (mapping_unevictable(page_mapping(page))) 2357 if (mapping_unevictable(page_mapping(page)))
2343 return 0; 2358 return 0;
2344 2359
2345 /* TODO: test page [!]evictable conditions */ 2360 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
2361 return 0;
2346 2362
2347 return 1; 2363 return 1;
2348} 2364}
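
With the vma argument now meaningful, the fault path can ask page_evictable(page, vma) before putting a newly instantiated page on the LRU; on an mlocked vma, is_mlocked_vma() marks the page PG_mlocked and the page belongs on the unevictable list instead. A rough sketch of that usage, assuming the add_page_to_unevictable_list() helper from the earlier unevictable-LRU infrastructure patch; the function name here is hypothetical and the real fault-path hook lands in later patches of the series:

	static void example_add_new_page_to_lru(struct page *page,
						struct vm_area_struct *vma)
	{
		if (page_evictable(page, vma))
			lru_cache_add_active(page);	/* normal [in]active LRU */
		else
			add_page_to_unevictable_list(page); /* PG_mlocked was set */
	}
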