Diffstat (limited to 'mm/mlock.c')
-rw-r--r--  mm/mlock.c  443
1 file changed, 425 insertions(+), 18 deletions(-)
diff --git a/mm/mlock.c b/mm/mlock.c
index 01fbe93eff5c..008ea70b7afa 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -8,10 +8,18 @@
 #include <linux/capability.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/syscalls.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <linux/rmap.h>
+#include <linux/mmzone.h>
+#include <linux/hugetlb.h>
+
+#include "internal.h"
 
 int can_do_mlock(void)
 {
@@ -23,17 +31,381 @@ int can_do_mlock(void)
 }
 EXPORT_SYMBOL(can_do_mlock);
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * Mlocked pages are marked with PageMlocked() flag for efficient testing
+ * in vmscan and, possibly, the fault path; and to support semi-accurate
+ * statistics.
+ *
+ * An mlocked page [PageMlocked(page)] is unevictable. As such, it will
+ * be placed on the LRU "unevictable" list, rather than the [in]active lists.
+ * The unevictable list is an LRU sibling list to the [in]active lists.
+ * PageUnevictable is set to indicate the unevictable state.
+ *
+ * When lazy mlocking via vmscan, it is important to ensure that the
+ * vma's VM_LOCKED status is not concurrently being modified, otherwise we
+ * may have mlocked a page that is being munlocked. So lazy mlock must take
+ * the mmap_sem for read, and verify that the vma really is locked
+ * (see mm/rmap.c).
+ */
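
Aside (not part of the patch): to make the locking rule above concrete, here is a minimal sketch of what a lazy-mlock site in mm/rmap.c is expected to do: take mmap_sem for read and re-check VM_LOCKED before marking the page. The helper name is made up for illustration; only mlock_vma_page() comes from this file.

/* Illustrative sketch only, not code from this commit. */
static int try_lazy_mlock(struct page *page, struct vm_area_struct *vma)
{
        struct mm_struct *mm = vma->vm_mm;

        if (!down_read_trylock(&mm->mmap_sem))
                return 0;                      /* can't verify VM_LOCKED; skip for now */
        if (vma->vm_flags & VM_LOCKED)         /* re-check under mmap_sem */
                mlock_vma_page(page);          /* caller must hold the page lock */
        up_read(&mm->mmap_sem);
        return 1;
}
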
+
+/*
+ * LRU accounting for clear_page_mlock()
+ */
+void __clear_page_mlock(struct page *page)
+{
+        VM_BUG_ON(!PageLocked(page));
+
+        if (!page->mapping) {   /* truncated ? */
+                return;
+        }
+
+        dec_zone_page_state(page, NR_MLOCK);
+        count_vm_event(UNEVICTABLE_PGCLEARED);
+        if (!isolate_lru_page(page)) {
+                putback_lru_page(page);
+        } else {
+                /*
+                 * Page not on the LRU yet. Flush all pagevecs and retry.
+                 */
+                lru_add_drain_all();
+                if (!isolate_lru_page(page))
+                        putback_lru_page(page);
+                else if (PageUnevictable(page))
+                        count_vm_event(UNEVICTABLE_PGSTRANDED);
+
+        }
+}
+
+/*
+ * Mark page as mlocked if not already.
+ * If page on LRU, isolate and putback to move to unevictable list.
+ */
+void mlock_vma_page(struct page *page)
+{
+        BUG_ON(!PageLocked(page));
+
+        if (!TestSetPageMlocked(page)) {
+                inc_zone_page_state(page, NR_MLOCK);
+                count_vm_event(UNEVICTABLE_PGMLOCKED);
+                if (!isolate_lru_page(page))
+                        putback_lru_page(page);
+        }
+}
+
+/*
+ * called from munlock()/munmap() path with page supposedly on the LRU.
+ *
+ * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked
+ * [in try_to_munlock()] and then attempt to isolate the page. We must
+ * isolate the page to keep others from messing with its unevictable
+ * and mlocked state while trying to munlock. However, we pre-clear the
+ * mlocked state anyway as we might lose the isolation race and we might
+ * not get another chance to clear PageMlocked. If we successfully
+ * isolate the page and try_to_munlock() detects other VM_LOCKED vmas
+ * mapping the page, it will restore the PageMlocked state, unless the page
+ * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(),
+ * perhaps redundantly.
+ * If we lose the isolation race, and the page is mapped by other VM_LOCKED
+ * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
+ * either of which will restore the PageMlocked state by calling
+ * mlock_vma_page() above, if it can grab the vma's mmap sem.
+ */
+static void munlock_vma_page(struct page *page)
+{
+        BUG_ON(!PageLocked(page));
+
+        if (TestClearPageMlocked(page)) {
+                dec_zone_page_state(page, NR_MLOCK);
+                if (!isolate_lru_page(page)) {
+                        int ret = try_to_munlock(page);
+                        /*
+                         * did try_to_unlock() succeed or punt?
+                         */
+                        if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN)
+                                count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+
+                        putback_lru_page(page);
+                } else {
+                        /*
+                         * We lost the race. let try_to_unmap() deal
+                         * with it. At least we get the page state and
+                         * mlock stats right. However, page is still on
+                         * the noreclaim list. We'll fix that up when
+                         * the page is eventually freed or we scan the
+                         * noreclaim list.
+                         */
+                        if (PageUnevictable(page))
+                                count_vm_event(UNEVICTABLE_PGSTRANDED);
+                        else
+                                count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+                }
+        }
+}
+
+/**
+ * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma.
+ * @vma:   target vma
+ * @start: start address
+ * @end:   end address
+ * @mlock: 0 indicate munlock, otherwise mlock.
+ *
+ * If @mlock == 0, unlock an mlocked range;
+ * else mlock the range of pages. This takes care of making the pages present,
+ * too.
+ *
+ * return 0 on success, negative error code on error.
+ *
+ * vma->vm_mm->mmap_sem must be held for at least read.
+ */
+static long __mlock_vma_pages_range(struct vm_area_struct *vma,
+                        unsigned long start, unsigned long end,
+                        int mlock)
+{
+        struct mm_struct *mm = vma->vm_mm;
+        unsigned long addr = start;
+        struct page *pages[16];  /* 16 gives a reasonable batch */
+        int nr_pages = (end - start) / PAGE_SIZE;
+        int ret;
+        int gup_flags = 0;
+
+        VM_BUG_ON(start & ~PAGE_MASK);
+        VM_BUG_ON(end & ~PAGE_MASK);
+        VM_BUG_ON(start < vma->vm_start);
+        VM_BUG_ON(end > vma->vm_end);
+        VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
+                  (atomic_read(&mm->mm_users) != 0));
+
+        /*
+         * mlock:   don't page populate if page has PROT_NONE permission.
+         * munlock: the pages always do munlock althrough
+         *          its has PROT_NONE permission.
+         */
+        if (!mlock)
+                gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS;
+
+        if (vma->vm_flags & VM_WRITE)
+                gup_flags |= GUP_FLAGS_WRITE;
+
+        lru_add_drain_all();    /* push cached pages to LRU */
+
+        while (nr_pages > 0) {
+                int i;
+
+                cond_resched();
+
+                /*
+                 * get_user_pages makes pages present if we are
+                 * setting mlock. and this extra reference count will
+                 * disable migration of this page. However, page may
+                 * still be truncated out from under us.
+                 */
+                ret = __get_user_pages(current, mm, addr,
+                                min_t(int, nr_pages, ARRAY_SIZE(pages)),
+                                gup_flags, pages, NULL);
+                /*
+                 * This can happen for, e.g., VM_NONLINEAR regions before
+                 * a page has been allocated and mapped at a given offset,
+                 * or for addresses that map beyond end of a file.
+                 * We'll mlock the the pages if/when they get faulted in.
+                 */
+                if (ret < 0)
+                        break;
+                if (ret == 0) {
+                        /*
+                         * We know the vma is there, so the only time
+                         * we cannot get a single page should be an
+                         * error (ret < 0) case.
+                         */
+                        WARN_ON(1);
+                        break;
+                }
+
+                lru_add_drain();        /* push cached pages to LRU */
+
+                for (i = 0; i < ret; i++) {
+                        struct page *page = pages[i];
+
+                        lock_page(page);
+                        /*
+                         * Because we lock page here and migration is blocked
+                         * by the elevated reference, we need only check for
+                         * page truncation (file-cache only).
+                         */
+                        if (page->mapping) {
+                                if (mlock)
+                                        mlock_vma_page(page);
+                                else
+                                        munlock_vma_page(page);
+                        }
+                        unlock_page(page);
+                        put_page(page);         /* ref from get_user_pages() */
+
+                        /*
+                         * here we assume that get_user_pages() has given us
+                         * a list of virtually contiguous pages.
+                         */
+                        addr += PAGE_SIZE;      /* for next get_user_pages() */
+                        nr_pages--;
+                }
+                ret = 0;
+        }
+
+        lru_add_drain_all();    /* to update stats */
+
+        return ret;     /* count entire vma as locked_vm */
+}
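
Aside (not part of the patch): the user-visible effect of __mlock_vma_pages_range() is that every page in the range has been faulted in and is resident once mlock() returns. A small standalone program using mincore(2) shows this; it assumes RLIMIT_MEMLOCK allows locking a few pages.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
    long pagesz = sysconf(_SC_PAGESIZE);
    size_t len = 4 * pagesz;                 /* small, to stay under RLIMIT_MEMLOCK */
    unsigned char *vec = malloc(len / pagesz);
    char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (buf == MAP_FAILED || !vec)
        return 1;

    mincore(buf, len, vec);                  /* anonymous pages: not resident yet */
    printf("page 0 resident before mlock: %d\n", vec[0] & 1);

    if (mlock(buf, len) != 0) {              /* kernel faults in and mlocks the range */
        perror("mlock");
        return 1;
    }
    mincore(buf, len, vec);
    printf("page 0 resident after mlock:  %d\n", vec[0] & 1);

    munlock(buf, len);
    munmap(buf, len);
    free(vec);
    return 0;
}
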
+
+/*
+ * convert get_user_pages() return value to posix mlock() error
+ */
+static int __mlock_posix_error_return(long retval)
+{
+        if (retval == -EFAULT)
+                retval = -ENOMEM;
+        else if (retval == -ENOMEM)
+                retval = -EAGAIN;
+        return retval;
+}
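
Aside (not part of the patch): seen from userspace, this translation keeps mlock(2) POSIX-conformant: -EFAULT from get_user_pages() (no mapping behind part of the range) is reported as ENOMEM, while -ENOMEM (allocation failure while faulting pages in) is reported as EAGAIN. A quick standalone check of the first case:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4 * sysconf(_SC_PAGESIZE);
    /* Map a region, then unmap it, so the address range is known to be invalid. */
    void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (p == MAP_FAILED)
        return 1;
    munmap(p, len);

    /* Locking an unmapped range must fail with ENOMEM, per POSIX. */
    if (mlock(p, len) != 0)
        printf("mlock: %s (errno=%d)\n", strerror(errno), errno);
    return 0;
}
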
+
+#else /* CONFIG_UNEVICTABLE_LRU */
+
+/*
+ * Just make pages present if VM_LOCKED. No-op if unlocking.
+ */
+static long __mlock_vma_pages_range(struct vm_area_struct *vma,
+                        unsigned long start, unsigned long end,
+                        int mlock)
+{
+        if (mlock && (vma->vm_flags & VM_LOCKED))
+                return make_pages_present(start, end);
+        return 0;
+}
+
+static inline int __mlock_posix_error_return(long retval)
+{
+        return 0;
+}
+
+#endif /* CONFIG_UNEVICTABLE_LRU */
+
+/**
+ * mlock_vma_pages_range() - mlock pages in specified vma range.
+ * @vma - the vma containing the specfied address range
+ * @start - starting address in @vma to mlock
+ * @end - end address [+1] in @vma to mlock
+ *
+ * For mmap()/mremap()/expansion of mlocked vma.
+ *
+ * return 0 on success for "normal" vmas.
+ *
+ * return number of pages [> 0] to be removed from locked_vm on success
+ * of "special" vmas.
+ *
+ * return negative error if vma spanning @start-@range disappears while
+ * mmap semaphore is dropped. Unlikely?
+ */
+long mlock_vma_pages_range(struct vm_area_struct *vma,
+                        unsigned long start, unsigned long end)
+{
+        struct mm_struct *mm = vma->vm_mm;
+        int nr_pages = (end - start) / PAGE_SIZE;
+        BUG_ON(!(vma->vm_flags & VM_LOCKED));
+
+        /*
+         * filter unlockable vmas
+         */
+        if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+                goto no_mlock;
+
+        if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
+                        is_vm_hugetlb_page(vma) ||
+                        vma == get_gate_vma(current))) {
+                long error;
+                downgrade_write(&mm->mmap_sem);
+
+                error = __mlock_vma_pages_range(vma, start, end, 1);
+
+                up_read(&mm->mmap_sem);
+                /* vma can change or disappear */
+                down_write(&mm->mmap_sem);
+                vma = find_vma(mm, start);
+                /* non-NULL vma must contain @start, but need to check @end */
+                if (!vma || end > vma->vm_end)
+                        return -ENOMEM;
+
+                return 0;       /* hide other errors from mmap(), et al */
+        }
+
+        /*
+         * User mapped kernel pages or huge pages:
+         * make these pages present to populate the ptes, but
+         * fall thru' to reset VM_LOCKED--no need to unlock, and
+         * return nr_pages so these don't get counted against task's
+         * locked limit. huge pages are already counted against
+         * locked vm limit.
+         */
+        make_pages_present(start, end);
+
+no_mlock:
+        vma->vm_flags &= ~VM_LOCKED;    /* and don't come back! */
+        return nr_pages;                /* error or pages NOT mlocked */
+}
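
Aside (not part of the patch): mlock_vma_pages_range() is the hook used when a locked vma is created or grown, for example by mmap() with MAP_LOCKED or by mremap() of an mlocked region, so the new pages are populated up front. A standalone illustration; it assumes RLIMIT_MEMLOCK permits a few locked pages.

#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4 * sysconf(_SC_PAGESIZE);
    /* MAP_LOCKED asks for the mapping to be mlocked and populated up front. */
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);

    if (p == MAP_FAILED) {
        perror("mmap(MAP_LOCKED)");          /* e.g. RLIMIT_MEMLOCK too small */
        return 1;
    }
    p[0] = 1;                                /* page is already resident and locked */
    munmap(p, len);
    return 0;
}
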
+
+
+/*
+ * munlock_vma_pages_range() - munlock all pages in the vma range.'
+ * @vma - vma containing range to be munlock()ed.
+ * @start - start address in @vma of the range
+ * @end - end of range in @vma.
+ *
+ * For mremap(), munmap() and exit().
+ *
+ * Called with @vma VM_LOCKED.
+ *
+ * Returns with VM_LOCKED cleared. Callers must be prepared to
+ * deal with this.
+ *
+ * We don't save and restore VM_LOCKED here because pages are
+ * still on lru. In unmap path, pages might be scanned by reclaim
+ * and re-mlocked by try_to_{munlock|unmap} before we unmap and
+ * free them. This will result in freeing mlocked pages.
+ */
+void munlock_vma_pages_range(struct vm_area_struct *vma,
+                        unsigned long start, unsigned long end)
+{
+        vma->vm_flags &= ~VM_LOCKED;
+        __mlock_vma_pages_range(vma, start, end, 0);
+}
+
+/*
+ * mlock_fixup - handle mlock[all]/munlock[all] requests.
+ *
+ * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
+ * munlock is a no-op. However, for some special vmas, we go ahead and
+ * populate the ptes via make_pages_present().
+ *
+ * For vmas that pass the filters, merge/split as appropriate.
+ */
 static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
         unsigned long start, unsigned long end, unsigned int newflags)
 {
-        struct mm_struct * mm = vma->vm_mm;
+        struct mm_struct *mm = vma->vm_mm;
         pgoff_t pgoff;
-        int pages;
+        int nr_pages;
         int ret = 0;
-
-        if (newflags == vma->vm_flags) {
-                *prev = vma;
-                goto out;
+        int lock = newflags & VM_LOCKED;
+
+        if (newflags == vma->vm_flags ||
+                        (vma->vm_flags & (VM_IO | VM_PFNMAP)))
+                goto out;       /* don't set VM_LOCKED, don't count */
+
+        if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
+                        is_vm_hugetlb_page(vma) ||
+                        vma == get_gate_vma(current)) {
+                if (lock)
+                        make_pages_present(start, end);
+                goto out;       /* don't set VM_LOCKED, don't count */
         }
 
         pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -44,8 +416,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
                 goto success;
         }
 
-        *prev = vma;
-
         if (start != vma->vm_start) {
                 ret = split_vma(mm, vma, start, 1);
                 if (ret)
@@ -60,24 +430,61 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
 success:
         /*
+         * Keep track of amount of locked VM.
+         */
+        nr_pages = (end - start) >> PAGE_SHIFT;
+        if (!lock)
+                nr_pages = -nr_pages;
+        mm->locked_vm += nr_pages;
+
+        /*
          * vm_flags is protected by the mmap_sem held in write mode.
          * It's okay if try_to_unmap_one unmaps a page just after we
-         * set VM_LOCKED, make_pages_present below will bring it back.
+         * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
          */
         vma->vm_flags = newflags;
 
-        /*
-         * Keep track of amount of locked VM.
-         */
-        pages = (end - start) >> PAGE_SHIFT;
-        if (newflags & VM_LOCKED) {
-                pages = -pages;
-                if (!(newflags & VM_IO))
-                        ret = make_pages_present(start, end);
+        if (lock) {
+                /*
+                 * mmap_sem is currently held for write. Downgrade the write
+                 * lock to a read lock so that other faults, mmap scans, ...
+                 * while we fault in all pages.
+                 */
+                downgrade_write(&mm->mmap_sem);
+
+                ret = __mlock_vma_pages_range(vma, start, end, 1);
+
+                /*
+                 * Need to reacquire mmap sem in write mode, as our callers
+                 * expect this. We have no support for atomically upgrading
+                 * a sem to write, so we need to check for ranges while sem
+                 * is unlocked.
+                 */
+                up_read(&mm->mmap_sem);
+                /* vma can change or disappear */
+                down_write(&mm->mmap_sem);
+                *prev = find_vma(mm, start);
+                /* non-NULL *prev must contain @start, but need to check @end */
+                if (!(*prev) || end > (*prev)->vm_end)
+                        ret = -ENOMEM;
+                else if (ret > 0) {
+                        mm->locked_vm -= ret;
+                        ret = 0;
+                } else
+                        ret = __mlock_posix_error_return(ret); /* translate if needed */
+        } else {
+                /*
+                 * TODO: for unlocking, pages will already be resident, so
+                 * we don't need to wait for allocations/reclaim/pagein, ...
+                 * However, unlocking a very large region can still take a
+                 * while. Should we downgrade the semaphore for both lock
+                 * AND unlock ?
+                 */
+                __mlock_vma_pages_range(vma, start, end, 0);
         }
 
-        mm->locked_vm -= pages;
 out:
+        *prev = vma;
         return ret;
 }
 
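
Aside (not part of the patch): the locked_vm accounting that mlock_fixup() now performs before faulting pages in is what /proc/<pid>/status reports as VmLck. A standalone program to watch it change across mlock()/munlock():

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

static void print_vmlck(const char *when)
{
    char line[256];
    FILE *f = fopen("/proc/self/status", "r");

    if (!f)
        return;
    while (fgets(line, sizeof(line), f))
        if (strncmp(line, "VmLck:", 6) == 0)
            printf("%-15s %s", when, line);
    fclose(f);
}

int main(void)
{
    size_t len = 4 * sysconf(_SC_PAGESIZE);
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (p == MAP_FAILED)
        return 1;
    print_vmlck("before mlock:");
    if (mlock(p, len) == 0) {                /* mlock_fixup() adds to locked_vm */
        print_vmlck("after mlock:");
        munlock(p, len);                     /* ...and subtracts it again */
        print_vmlck("after munlock:");
    }
    munmap(p, len);
    return 0;
}
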