Diffstat (limited to 'mm/rmap.c')
-rw-r--r--   mm/rmap.c | 536
1 file changed, 348 insertions(+), 188 deletions(-)
@@ -49,6 +49,7 @@ | |||
49 | #include <linux/swapops.h> | 49 | #include <linux/swapops.h> |
50 | #include <linux/slab.h> | 50 | #include <linux/slab.h> |
51 | #include <linux/init.h> | 51 | #include <linux/init.h> |
52 | #include <linux/ksm.h> | ||
52 | #include <linux/rmap.h> | 53 | #include <linux/rmap.h> |
53 | #include <linux/rcupdate.h> | 54 | #include <linux/rcupdate.h> |
54 | #include <linux/module.h> | 55 | #include <linux/module.h> |
@@ -61,17 +62,28 @@ | |||
61 | #include "internal.h" | 62 | #include "internal.h" |
62 | 63 | ||
63 | static struct kmem_cache *anon_vma_cachep; | 64 | static struct kmem_cache *anon_vma_cachep; |
65 | static struct kmem_cache *anon_vma_chain_cachep; | ||
64 | 66 | ||
65 | static inline struct anon_vma *anon_vma_alloc(void) | 67 | static inline struct anon_vma *anon_vma_alloc(void) |
66 | { | 68 | { |
67 | return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); | 69 | return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); |
68 | } | 70 | } |
69 | 71 | ||
70 | static inline void anon_vma_free(struct anon_vma *anon_vma) | 72 | void anon_vma_free(struct anon_vma *anon_vma) |
71 | { | 73 | { |
72 | kmem_cache_free(anon_vma_cachep, anon_vma); | 74 | kmem_cache_free(anon_vma_cachep, anon_vma); |
73 | } | 75 | } |
74 | 76 | ||
77 | static inline struct anon_vma_chain *anon_vma_chain_alloc(void) | ||
78 | { | ||
79 | return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); | ||
80 | } | ||
81 | |||
82 | void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | ||
83 | { | ||
84 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); | ||
85 | } | ||
86 | |||
75 | /** | 87 | /** |
76 | * anon_vma_prepare - attach an anon_vma to a memory region | 88 | * anon_vma_prepare - attach an anon_vma to a memory region |
77 | * @vma: the memory region in question | 89 | * @vma: the memory region in question |
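Editor's note: the anon_vma_chain_alloc()/anon_vma_chain_free() helpers added in this hunk manage the new link objects that tie one VMA to one anon_vma. The structure itself is declared in include/linux/rmap.h and is not part of this file's diff; the following is only a sketch of the layout implied by the fields this patch uses (vma, anon_vma, same_vma, same_anon_vma):

struct anon_vma_chain {
	struct vm_area_struct *vma;	/* the VMA this link belongs to */
	struct anon_vma *anon_vma;	/* the anon_vma the VMA is linked into */
	struct list_head same_vma;	/* entry on vma->anon_vma_chain */
	struct list_head same_anon_vma;	/* entry on anon_vma->head */
};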
@@ -102,18 +114,23 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
102 | int anon_vma_prepare(struct vm_area_struct *vma) | 114 | int anon_vma_prepare(struct vm_area_struct *vma) |
103 | { | 115 | { |
104 | struct anon_vma *anon_vma = vma->anon_vma; | 116 | struct anon_vma *anon_vma = vma->anon_vma; |
117 | struct anon_vma_chain *avc; | ||
105 | 118 | ||
106 | might_sleep(); | 119 | might_sleep(); |
107 | if (unlikely(!anon_vma)) { | 120 | if (unlikely(!anon_vma)) { |
108 | struct mm_struct *mm = vma->vm_mm; | 121 | struct mm_struct *mm = vma->vm_mm; |
109 | struct anon_vma *allocated; | 122 | struct anon_vma *allocated; |
110 | 123 | ||
124 | avc = anon_vma_chain_alloc(); | ||
125 | if (!avc) | ||
126 | goto out_enomem; | ||
127 | |||
111 | anon_vma = find_mergeable_anon_vma(vma); | 128 | anon_vma = find_mergeable_anon_vma(vma); |
112 | allocated = NULL; | 129 | allocated = NULL; |
113 | if (!anon_vma) { | 130 | if (!anon_vma) { |
114 | anon_vma = anon_vma_alloc(); | 131 | anon_vma = anon_vma_alloc(); |
115 | if (unlikely(!anon_vma)) | 132 | if (unlikely(!anon_vma)) |
116 | return -ENOMEM; | 133 | goto out_enomem_free_avc; |
117 | allocated = anon_vma; | 134 | allocated = anon_vma; |
118 | } | 135 | } |
119 | spin_lock(&anon_vma->lock); | 136 | spin_lock(&anon_vma->lock); |
@@ -122,67 +139,141 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
122 | spin_lock(&mm->page_table_lock); | 139 | spin_lock(&mm->page_table_lock); |
123 | if (likely(!vma->anon_vma)) { | 140 | if (likely(!vma->anon_vma)) { |
124 | vma->anon_vma = anon_vma; | 141 | vma->anon_vma = anon_vma; |
125 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 142 | avc->anon_vma = anon_vma; |
143 | avc->vma = vma; | ||
144 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
145 | list_add(&avc->same_anon_vma, &anon_vma->head); | ||
126 | allocated = NULL; | 146 | allocated = NULL; |
127 | } | 147 | } |
128 | spin_unlock(&mm->page_table_lock); | 148 | spin_unlock(&mm->page_table_lock); |
129 | 149 | ||
130 | spin_unlock(&anon_vma->lock); | 150 | spin_unlock(&anon_vma->lock); |
131 | if (unlikely(allocated)) | 151 | if (unlikely(allocated)) { |
132 | anon_vma_free(allocated); | 152 | anon_vma_free(allocated); |
153 | anon_vma_chain_free(avc); | ||
154 | } | ||
133 | } | 155 | } |
134 | return 0; | 156 | return 0; |
157 | |||
158 | out_enomem_free_avc: | ||
159 | anon_vma_chain_free(avc); | ||
160 | out_enomem: | ||
161 | return -ENOMEM; | ||
135 | } | 162 | } |
136 | 163 | ||
137 | void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) | 164 | static void anon_vma_chain_link(struct vm_area_struct *vma, |
165 | struct anon_vma_chain *avc, | ||
166 | struct anon_vma *anon_vma) | ||
138 | { | 167 | { |
139 | BUG_ON(vma->anon_vma != next->anon_vma); | 168 | avc->vma = vma; |
140 | list_del(&next->anon_vma_node); | 169 | avc->anon_vma = anon_vma; |
170 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
171 | |||
172 | spin_lock(&anon_vma->lock); | ||
173 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
174 | spin_unlock(&anon_vma->lock); | ||
141 | } | 175 | } |
142 | 176 | ||
143 | void __anon_vma_link(struct vm_area_struct *vma) | 177 | /* |
178 | * Attach the anon_vmas from src to dst. | ||
179 | * Returns 0 on success, -ENOMEM on failure. | ||
180 | */ | ||
181 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | ||
144 | { | 182 | { |
145 | struct anon_vma *anon_vma = vma->anon_vma; | 183 | struct anon_vma_chain *avc, *pavc; |
184 | |||
185 | list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) { | ||
186 | avc = anon_vma_chain_alloc(); | ||
187 | if (!avc) | ||
188 | goto enomem_failure; | ||
189 | anon_vma_chain_link(dst, avc, pavc->anon_vma); | ||
190 | } | ||
191 | return 0; | ||
146 | 192 | ||
147 | if (anon_vma) | 193 | enomem_failure: |
148 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 194 | unlink_anon_vmas(dst); |
195 | return -ENOMEM; | ||
149 | } | 196 | } |
150 | 197 | ||
151 | void anon_vma_link(struct vm_area_struct *vma) | 198 | /* |
199 | * Attach vma to its own anon_vma, as well as to the anon_vmas that | ||
200 | * the corresponding VMA in the parent process is attached to. | ||
201 | * Returns 0 on success, non-zero on failure. | ||
202 | */ | ||
203 | int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | ||
152 | { | 204 | { |
153 | struct anon_vma *anon_vma = vma->anon_vma; | 205 | struct anon_vma_chain *avc; |
206 | struct anon_vma *anon_vma; | ||
154 | 207 | ||
155 | if (anon_vma) { | 208 | /* Don't bother if the parent process has no anon_vma here. */ |
156 | spin_lock(&anon_vma->lock); | 209 | if (!pvma->anon_vma) |
157 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 210 | return 0; |
158 | spin_unlock(&anon_vma->lock); | 211 | |
159 | } | 212 | /* |
213 | * First, attach the new VMA to the parent VMA's anon_vmas, | ||
214 | * so rmap can find non-COWed pages in child processes. | ||
215 | */ | ||
216 | if (anon_vma_clone(vma, pvma)) | ||
217 | return -ENOMEM; | ||
218 | |||
219 | /* Then add our own anon_vma. */ | ||
220 | anon_vma = anon_vma_alloc(); | ||
221 | if (!anon_vma) | ||
222 | goto out_error; | ||
223 | avc = anon_vma_chain_alloc(); | ||
224 | if (!avc) | ||
225 | goto out_error_free_anon_vma; | ||
226 | anon_vma_chain_link(vma, avc, anon_vma); | ||
227 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | ||
228 | vma->anon_vma = anon_vma; | ||
229 | |||
230 | return 0; | ||
231 | |||
232 | out_error_free_anon_vma: | ||
233 | anon_vma_free(anon_vma); | ||
234 | out_error: | ||
235 | unlink_anon_vmas(vma); | ||
236 | return -ENOMEM; | ||
160 | } | 237 | } |
161 | 238 | ||
162 | void anon_vma_unlink(struct vm_area_struct *vma) | 239 | static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) |
163 | { | 240 | { |
164 | struct anon_vma *anon_vma = vma->anon_vma; | 241 | struct anon_vma *anon_vma = anon_vma_chain->anon_vma; |
165 | int empty; | 242 | int empty; |
166 | 243 | ||
244 | /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ | ||
167 | if (!anon_vma) | 245 | if (!anon_vma) |
168 | return; | 246 | return; |
169 | 247 | ||
170 | spin_lock(&anon_vma->lock); | 248 | spin_lock(&anon_vma->lock); |
171 | list_del(&vma->anon_vma_node); | 249 | list_del(&anon_vma_chain->same_anon_vma); |
172 | 250 | ||
173 | /* We must garbage collect the anon_vma if it's empty */ | 251 | /* We must garbage collect the anon_vma if it's empty */ |
174 | empty = list_empty(&anon_vma->head); | 252 | empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); |
175 | spin_unlock(&anon_vma->lock); | 253 | spin_unlock(&anon_vma->lock); |
176 | 254 | ||
177 | if (empty) | 255 | if (empty) |
178 | anon_vma_free(anon_vma); | 256 | anon_vma_free(anon_vma); |
179 | } | 257 | } |
180 | 258 | ||
259 | void unlink_anon_vmas(struct vm_area_struct *vma) | ||
260 | { | ||
261 | struct anon_vma_chain *avc, *next; | ||
262 | |||
263 | /* Unlink each anon_vma chained to the VMA. */ | ||
264 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | ||
265 | anon_vma_unlink(avc); | ||
266 | list_del(&avc->same_vma); | ||
267 | anon_vma_chain_free(avc); | ||
268 | } | ||
269 | } | ||
270 | |||
181 | static void anon_vma_ctor(void *data) | 271 | static void anon_vma_ctor(void *data) |
182 | { | 272 | { |
183 | struct anon_vma *anon_vma = data; | 273 | struct anon_vma *anon_vma = data; |
184 | 274 | ||
185 | spin_lock_init(&anon_vma->lock); | 275 | spin_lock_init(&anon_vma->lock); |
276 | ksm_refcount_init(anon_vma); | ||
186 | INIT_LIST_HEAD(&anon_vma->head); | 277 | INIT_LIST_HEAD(&anon_vma->head); |
187 | } | 278 | } |
188 | 279 | ||
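Editor's note: with the chain model in place, anon_vma_clone() copies all of the parent VMA's anon_vma links onto a new VMA, and anon_vma_fork() additionally creates a fresh anon_vma so the child's COWed pages are only findable through the child. A rough sketch of how a fork-time caller, along the lines of dup_mmap() in kernel/fork.c (not shown in this diff), is expected to use these entry points; the wrapper name below is illustrative:

/* Illustrative only: roughly how the fork path hooks into anon_vma_fork(). */
static int copy_one_vma_anon(struct vm_area_struct *tmp,	/* child's copy */
			     struct vm_area_struct *mpnt)	/* parent VMA */
{
	INIT_LIST_HEAD(&tmp->anon_vma_chain);	/* child starts with no links */
	if (anon_vma_fork(tmp, mpnt))		/* clone parent links, add own anon_vma */
		return -ENOMEM;
	return 0;
}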
@@ -190,6 +281,7 @@ void __init anon_vma_init(void) | |||
190 | { | 281 | { |
191 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), | 282 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), |
192 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); | 283 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); |
284 | anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); | ||
193 | } | 285 | } |
194 | 286 | ||
195 | /* | 287 | /* |
@@ -202,8 +294,8 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
202 | unsigned long anon_mapping; | 294 | unsigned long anon_mapping; |
203 | 295 | ||
204 | rcu_read_lock(); | 296 | rcu_read_lock(); |
205 | anon_mapping = (unsigned long) page->mapping; | 297 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); |
206 | if (!(anon_mapping & PAGE_MAPPING_ANON)) | 298 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) |
207 | goto out; | 299 | goto out; |
208 | if (!page_mapped(page)) | 300 | if (!page_mapped(page)) |
209 | goto out; | 301 | goto out; |
@@ -248,8 +340,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
248 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 340 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
249 | { | 341 | { |
250 | if (PageAnon(page)) { | 342 | if (PageAnon(page)) { |
251 | if ((void *)vma->anon_vma != | 343 | if (vma->anon_vma != page_anon_vma(page)) |
252 | (void *)page->mapping - PAGE_MAPPING_ANON) | ||
253 | return -EFAULT; | 344 | return -EFAULT; |
254 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { | 345 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { |
255 | if (!vma->vm_file || | 346 | if (!vma->vm_file || |
@@ -337,21 +428,15 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
337 | * Subfunctions of page_referenced: page_referenced_one called | 428 | * Subfunctions of page_referenced: page_referenced_one called |
338 | * repeatedly from either page_referenced_anon or page_referenced_file. | 429 | * repeatedly from either page_referenced_anon or page_referenced_file. |
339 | */ | 430 | */ |
340 | static int page_referenced_one(struct page *page, | 431 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, |
341 | struct vm_area_struct *vma, | 432 | unsigned long address, unsigned int *mapcount, |
342 | unsigned int *mapcount, | 433 | unsigned long *vm_flags) |
343 | unsigned long *vm_flags) | ||
344 | { | 434 | { |
345 | struct mm_struct *mm = vma->vm_mm; | 435 | struct mm_struct *mm = vma->vm_mm; |
346 | unsigned long address; | ||
347 | pte_t *pte; | 436 | pte_t *pte; |
348 | spinlock_t *ptl; | 437 | spinlock_t *ptl; |
349 | int referenced = 0; | 438 | int referenced = 0; |
350 | 439 | ||
351 | address = vma_address(page, vma); | ||
352 | if (address == -EFAULT) | ||
353 | goto out; | ||
354 | |||
355 | pte = page_check_address(page, mm, address, &ptl, 0); | 440 | pte = page_check_address(page, mm, address, &ptl, 0); |
356 | if (!pte) | 441 | if (!pte) |
357 | goto out; | 442 | goto out; |
@@ -388,9 +473,10 @@ static int page_referenced_one(struct page *page, | |||
388 | out_unmap: | 473 | out_unmap: |
389 | (*mapcount)--; | 474 | (*mapcount)--; |
390 | pte_unmap_unlock(pte, ptl); | 475 | pte_unmap_unlock(pte, ptl); |
391 | out: | 476 | |
392 | if (referenced) | 477 | if (referenced) |
393 | *vm_flags |= vma->vm_flags; | 478 | *vm_flags |= vma->vm_flags; |
479 | out: | ||
394 | return referenced; | 480 | return referenced; |
395 | } | 481 | } |
396 | 482 | ||
@@ -400,7 +486,7 @@ static int page_referenced_anon(struct page *page, | |||
400 | { | 486 | { |
401 | unsigned int mapcount; | 487 | unsigned int mapcount; |
402 | struct anon_vma *anon_vma; | 488 | struct anon_vma *anon_vma; |
403 | struct vm_area_struct *vma; | 489 | struct anon_vma_chain *avc; |
404 | int referenced = 0; | 490 | int referenced = 0; |
405 | 491 | ||
406 | anon_vma = page_lock_anon_vma(page); | 492 | anon_vma = page_lock_anon_vma(page); |
@@ -408,7 +494,11 @@ static int page_referenced_anon(struct page *page, | |||
408 | return referenced; | 494 | return referenced; |
409 | 495 | ||
410 | mapcount = page_mapcount(page); | 496 | mapcount = page_mapcount(page); |
411 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 497 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
498 | struct vm_area_struct *vma = avc->vma; | ||
499 | unsigned long address = vma_address(page, vma); | ||
500 | if (address == -EFAULT) | ||
501 | continue; | ||
412 | /* | 502 | /* |
413 | * If we are reclaiming on behalf of a cgroup, skip | 503 | * If we are reclaiming on behalf of a cgroup, skip |
414 | * counting on behalf of references from different | 504 | * counting on behalf of references from different |
@@ -416,7 +506,7 @@ static int page_referenced_anon(struct page *page, | |||
416 | */ | 506 | */ |
417 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 507 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
418 | continue; | 508 | continue; |
419 | referenced += page_referenced_one(page, vma, | 509 | referenced += page_referenced_one(page, vma, address, |
420 | &mapcount, vm_flags); | 510 | &mapcount, vm_flags); |
421 | if (!mapcount) | 511 | if (!mapcount) |
422 | break; | 512 | break; |
@@ -474,6 +564,9 @@ static int page_referenced_file(struct page *page, | |||
474 | mapcount = page_mapcount(page); | 564 | mapcount = page_mapcount(page); |
475 | 565 | ||
476 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 566 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
567 | unsigned long address = vma_address(page, vma); | ||
568 | if (address == -EFAULT) | ||
569 | continue; | ||
477 | /* | 570 | /* |
478 | * If we are reclaiming on behalf of a cgroup, skip | 571 | * If we are reclaiming on behalf of a cgroup, skip |
479 | * counting on behalf of references from different | 572 | * counting on behalf of references from different |
@@ -481,7 +574,7 @@ static int page_referenced_file(struct page *page, | |||
481 | */ | 574 | */ |
482 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 575 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
483 | continue; | 576 | continue; |
484 | referenced += page_referenced_one(page, vma, | 577 | referenced += page_referenced_one(page, vma, address, |
485 | &mapcount, vm_flags); | 578 | &mapcount, vm_flags); |
486 | if (!mapcount) | 579 | if (!mapcount) |
487 | break; | 580 | break; |
@@ -507,46 +600,44 @@ int page_referenced(struct page *page, | |||
507 | unsigned long *vm_flags) | 600 | unsigned long *vm_flags) |
508 | { | 601 | { |
509 | int referenced = 0; | 602 | int referenced = 0; |
510 | 603 | int we_locked = 0; | |
511 | if (TestClearPageReferenced(page)) | ||
512 | referenced++; | ||
513 | 604 | ||
514 | *vm_flags = 0; | 605 | *vm_flags = 0; |
515 | if (page_mapped(page) && page->mapping) { | 606 | if (page_mapped(page) && page_rmapping(page)) { |
516 | if (PageAnon(page)) | 607 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { |
608 | we_locked = trylock_page(page); | ||
609 | if (!we_locked) { | ||
610 | referenced++; | ||
611 | goto out; | ||
612 | } | ||
613 | } | ||
614 | if (unlikely(PageKsm(page))) | ||
615 | referenced += page_referenced_ksm(page, mem_cont, | ||
616 | vm_flags); | ||
617 | else if (PageAnon(page)) | ||
517 | referenced += page_referenced_anon(page, mem_cont, | 618 | referenced += page_referenced_anon(page, mem_cont, |
518 | vm_flags); | 619 | vm_flags); |
519 | else if (is_locked) | 620 | else if (page->mapping) |
520 | referenced += page_referenced_file(page, mem_cont, | 621 | referenced += page_referenced_file(page, mem_cont, |
521 | vm_flags); | 622 | vm_flags); |
522 | else if (!trylock_page(page)) | 623 | if (we_locked) |
523 | referenced++; | ||
524 | else { | ||
525 | if (page->mapping) | ||
526 | referenced += page_referenced_file(page, | ||
527 | mem_cont, vm_flags); | ||
528 | unlock_page(page); | 624 | unlock_page(page); |
529 | } | ||
530 | } | 625 | } |
531 | 626 | out: | |
532 | if (page_test_and_clear_young(page)) | 627 | if (page_test_and_clear_young(page)) |
533 | referenced++; | 628 | referenced++; |
534 | 629 | ||
535 | return referenced; | 630 | return referenced; |
536 | } | 631 | } |
537 | 632 | ||
538 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | 633 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, |
634 | unsigned long address) | ||
539 | { | 635 | { |
540 | struct mm_struct *mm = vma->vm_mm; | 636 | struct mm_struct *mm = vma->vm_mm; |
541 | unsigned long address; | ||
542 | pte_t *pte; | 637 | pte_t *pte; |
543 | spinlock_t *ptl; | 638 | spinlock_t *ptl; |
544 | int ret = 0; | 639 | int ret = 0; |
545 | 640 | ||
546 | address = vma_address(page, vma); | ||
547 | if (address == -EFAULT) | ||
548 | goto out; | ||
549 | |||
550 | pte = page_check_address(page, mm, address, &ptl, 1); | 641 | pte = page_check_address(page, mm, address, &ptl, 1); |
551 | if (!pte) | 642 | if (!pte) |
552 | goto out; | 643 | goto out; |
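Editor's note: page_referenced() now handles KSM pages and only takes the page lock itself (via trylock_page()) when the caller did not, i.e. for file and KSM pages called with is_locked == 0. A sketch of a reclaim-style caller; the function and variable names are illustrative, not taken from this diff (the real caller lives in mm/vmscan.c):

/* Illustrative reclaim-side use: the page is already locked, so pass
 * is_locked = 1 and page_referenced() never needs its trylock fallback. */
static int page_was_referenced(struct page *page, struct mem_cgroup *memcg)
{
	unsigned long vm_flags;

	return page_referenced(page, 1, memcg, &vm_flags);
}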
@@ -578,8 +669,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) | |||
578 | 669 | ||
579 | spin_lock(&mapping->i_mmap_lock); | 670 | spin_lock(&mapping->i_mmap_lock); |
580 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 671 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
581 | if (vma->vm_flags & VM_SHARED) | 672 | if (vma->vm_flags & VM_SHARED) { |
582 | ret += page_mkclean_one(page, vma); | 673 | unsigned long address = vma_address(page, vma); |
674 | if (address == -EFAULT) | ||
675 | continue; | ||
676 | ret += page_mkclean_one(page, vma, address); | ||
677 | } | ||
583 | } | 678 | } |
584 | spin_unlock(&mapping->i_mmap_lock); | 679 | spin_unlock(&mapping->i_mmap_lock); |
585 | return ret; | 680 | return ret; |
@@ -607,6 +702,30 @@ int page_mkclean(struct page *page) | |||
607 | EXPORT_SYMBOL_GPL(page_mkclean); | 702 | EXPORT_SYMBOL_GPL(page_mkclean); |
608 | 703 | ||
609 | /** | 704 | /** |
705 | * page_move_anon_rmap - move a page to our anon_vma | ||
706 | * @page: the page to move to our anon_vma | ||
707 | * @vma: the vma the page belongs to | ||
708 | * @address: the user virtual address mapped | ||
709 | * | ||
710 | * When a page belongs exclusively to one process after a COW event, | ||
711 | * that page can be moved into the anon_vma that belongs to just that | ||
712 | * process, so the rmap code will not search the parent or sibling | ||
713 | * processes. | ||
714 | */ | ||
715 | void page_move_anon_rmap(struct page *page, | ||
716 | struct vm_area_struct *vma, unsigned long address) | ||
717 | { | ||
718 | struct anon_vma *anon_vma = vma->anon_vma; | ||
719 | |||
720 | VM_BUG_ON(!PageLocked(page)); | ||
721 | VM_BUG_ON(!anon_vma); | ||
722 | VM_BUG_ON(page->index != linear_page_index(vma, address)); | ||
723 | |||
724 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
725 | page->mapping = (struct address_space *) anon_vma; | ||
726 | } | ||
727 | |||
728 | /** | ||
610 | * __page_set_anon_rmap - setup new anonymous rmap | 729 | * __page_set_anon_rmap - setup new anonymous rmap |
611 | * @page: the page to add the mapping to | 730 | * @page: the page to add the mapping to |
612 | * @vma: the vm area in which the mapping is added | 731 | * @vma: the vm area in which the mapping is added |
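Editor's note: page_move_anon_rmap() lets the write-fault path re-point an exclusively owned page at the faulting process's own anon_vma, so later rmap walks skip the parent's and siblings' VMAs. A sketch of the kind of call site this is written for, in the spirit of the COW-reuse path in mm/memory.c; the surrounding checks and the helper name are assumptions, not taken from this diff:

/* Illustrative COW-reuse hook: once this process is the only mapper of an
 * anonymous, non-KSM page, hand the page to its own anon_vma.  The page
 * lock is required by page_move_anon_rmap(). */
static void reuse_cowed_page(struct page *page, struct vm_area_struct *vma,
			     unsigned long address)
{
	if (PageAnon(page) && !PageKsm(page) && page_mapcount(page) == 1) {
		lock_page(page);
		page_move_anon_rmap(page, vma, address);
		unlock_page(page);
	}
}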
@@ -620,14 +739,7 @@ static void __page_set_anon_rmap(struct page *page, | |||
620 | BUG_ON(!anon_vma); | 739 | BUG_ON(!anon_vma); |
621 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 740 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
622 | page->mapping = (struct address_space *) anon_vma; | 741 | page->mapping = (struct address_space *) anon_vma; |
623 | |||
624 | page->index = linear_page_index(vma, address); | 742 | page->index = linear_page_index(vma, address); |
625 | |||
626 | /* | ||
627 | * nr_mapped state can be updated without turning off | ||
628 | * interrupts because it is not modified via interrupt. | ||
629 | */ | ||
630 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
631 | } | 743 | } |
632 | 744 | ||
633 | /** | 745 | /** |
@@ -652,9 +764,6 @@ static void __page_check_anon_rmap(struct page *page, | |||
652 | * are initially only visible via the pagetables, and the pte is locked | 764 | * are initially only visible via the pagetables, and the pte is locked |
653 | * over the call to page_add_new_anon_rmap. | 765 | * over the call to page_add_new_anon_rmap. |
654 | */ | 766 | */ |
655 | struct anon_vma *anon_vma = vma->anon_vma; | ||
656 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
657 | BUG_ON(page->mapping != (struct address_space *)anon_vma); | ||
658 | BUG_ON(page->index != linear_page_index(vma, address)); | 767 | BUG_ON(page->index != linear_page_index(vma, address)); |
659 | #endif | 768 | #endif |
660 | } | 769 | } |
@@ -665,14 +774,23 @@ static void __page_check_anon_rmap(struct page *page, | |||
665 | * @vma: the vm area in which the mapping is added | 774 | * @vma: the vm area in which the mapping is added |
666 | * @address: the user virtual address mapped | 775 | * @address: the user virtual address mapped |
667 | * | 776 | * |
668 | * The caller needs to hold the pte lock and the page must be locked. | 777 | * The caller needs to hold the pte lock, and the page must be locked in |
778 | * the anon_vma case: to serialize mapping,index checking after setting, | ||
779 | * and to ensure that PageAnon is not being upgraded racily to PageKsm | ||
780 | * (but PageKsm is never downgraded to PageAnon). | ||
669 | */ | 781 | */ |
670 | void page_add_anon_rmap(struct page *page, | 782 | void page_add_anon_rmap(struct page *page, |
671 | struct vm_area_struct *vma, unsigned long address) | 783 | struct vm_area_struct *vma, unsigned long address) |
672 | { | 784 | { |
785 | int first = atomic_inc_and_test(&page->_mapcount); | ||
786 | if (first) | ||
787 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
788 | if (unlikely(PageKsm(page))) | ||
789 | return; | ||
790 | |||
673 | VM_BUG_ON(!PageLocked(page)); | 791 | VM_BUG_ON(!PageLocked(page)); |
674 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 792 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
675 | if (atomic_inc_and_test(&page->_mapcount)) | 793 | if (first) |
676 | __page_set_anon_rmap(page, vma, address); | 794 | __page_set_anon_rmap(page, vma, address); |
677 | else | 795 | else |
678 | __page_check_anon_rmap(page, vma, address); | 796 | __page_check_anon_rmap(page, vma, address); |
@@ -694,6 +812,7 @@ void page_add_new_anon_rmap(struct page *page, | |||
694 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 812 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
695 | SetPageSwapBacked(page); | 813 | SetPageSwapBacked(page); |
696 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 814 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
815 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
697 | __page_set_anon_rmap(page, vma, address); | 816 | __page_set_anon_rmap(page, vma, address); |
698 | if (page_evictable(page, vma)) | 817 | if (page_evictable(page, vma)) |
699 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 818 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
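Editor's note: the NR_ANON_PAGES accounting change above keeps the usual split between the two anon rmap entry points: page_add_new_anon_rmap() is for a freshly allocated page that only the current process can see (no page lock needed, mapcount known to start at -1), while page_add_anon_rmap() is for a page that may already be mapped elsewhere and must be locked, per the updated comment earlier in this patch. A small illustrative contrast; the wrapper names are made up and the real callers sit in mm/memory.c:

/* New anonymous page, e.g. a zero-filled page from an anonymous fault:
 * nobody else can see it yet, so no page lock is required. */
static void map_new_anon_page(struct page *page, struct vm_area_struct *vma,
			      unsigned long address)
{
	page_add_new_anon_rmap(page, vma, address);
}

/* Existing page being mapped again, e.g. at swap-in: the page lock keeps
 * the mapping/index checks and the PageKsm() test stable. */
static void remap_existing_anon_page(struct page *page,
				     struct vm_area_struct *vma,
				     unsigned long address)
{
	VM_BUG_ON(!PageLocked(page));
	page_add_anon_rmap(page, vma, address);
}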
@@ -711,7 +830,7 @@ void page_add_file_rmap(struct page *page) | |||
711 | { | 830 | { |
712 | if (atomic_inc_and_test(&page->_mapcount)) { | 831 | if (atomic_inc_and_test(&page->_mapcount)) { |
713 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 832 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
714 | mem_cgroup_update_mapped_file_stat(page, 1); | 833 | mem_cgroup_update_file_mapped(page, 1); |
715 | } | 834 | } |
716 | } | 835 | } |
717 | 836 | ||
@@ -743,8 +862,8 @@ void page_remove_rmap(struct page *page) | |||
743 | __dec_zone_page_state(page, NR_ANON_PAGES); | 862 | __dec_zone_page_state(page, NR_ANON_PAGES); |
744 | } else { | 863 | } else { |
745 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 864 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
865 | mem_cgroup_update_file_mapped(page, -1); | ||
746 | } | 866 | } |
747 | mem_cgroup_update_mapped_file_stat(page, -1); | ||
748 | /* | 867 | /* |
749 | * It would be tidy to reset the PageAnon mapping here, | 868 | * It would be tidy to reset the PageAnon mapping here, |
750 | * but that might overwrite a racing page_add_anon_rmap | 869 | * but that might overwrite a racing page_add_anon_rmap |
@@ -760,20 +879,15 @@ void page_remove_rmap(struct page *page) | |||
760 | * Subfunctions of try_to_unmap: try_to_unmap_one called | 879 | * Subfunctions of try_to_unmap: try_to_unmap_one called |
761 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. | 880 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. |
762 | */ | 881 | */ |
763 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 882 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
764 | enum ttu_flags flags) | 883 | unsigned long address, enum ttu_flags flags) |
765 | { | 884 | { |
766 | struct mm_struct *mm = vma->vm_mm; | 885 | struct mm_struct *mm = vma->vm_mm; |
767 | unsigned long address; | ||
768 | pte_t *pte; | 886 | pte_t *pte; |
769 | pte_t pteval; | 887 | pte_t pteval; |
770 | spinlock_t *ptl; | 888 | spinlock_t *ptl; |
771 | int ret = SWAP_AGAIN; | 889 | int ret = SWAP_AGAIN; |
772 | 890 | ||
773 | address = vma_address(page, vma); | ||
774 | if (address == -EFAULT) | ||
775 | goto out; | ||
776 | |||
777 | pte = page_check_address(page, mm, address, &ptl, 0); | 891 | pte = page_check_address(page, mm, address, &ptl, 0); |
778 | if (!pte) | 892 | if (!pte) |
779 | goto out; | 893 | goto out; |
@@ -784,10 +898,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
784 | * skipped over this mm) then we should reactivate it. | 898 | * skipped over this mm) then we should reactivate it. |
785 | */ | 899 | */ |
786 | if (!(flags & TTU_IGNORE_MLOCK)) { | 900 | if (!(flags & TTU_IGNORE_MLOCK)) { |
787 | if (vma->vm_flags & VM_LOCKED) { | 901 | if (vma->vm_flags & VM_LOCKED) |
788 | ret = SWAP_MLOCK; | 902 | goto out_mlock; |
903 | |||
904 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
789 | goto out_unmap; | 905 | goto out_unmap; |
790 | } | ||
791 | } | 906 | } |
792 | if (!(flags & TTU_IGNORE_ACCESS)) { | 907 | if (!(flags & TTU_IGNORE_ACCESS)) { |
793 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | 908 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
@@ -809,9 +924,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
809 | 924 | ||
810 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { | 925 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
811 | if (PageAnon(page)) | 926 | if (PageAnon(page)) |
812 | dec_mm_counter(mm, anon_rss); | 927 | dec_mm_counter(mm, MM_ANONPAGES); |
813 | else | 928 | else |
814 | dec_mm_counter(mm, file_rss); | 929 | dec_mm_counter(mm, MM_FILEPAGES); |
815 | set_pte_at(mm, address, pte, | 930 | set_pte_at(mm, address, pte, |
816 | swp_entry_to_pte(make_hwpoison_entry(page))); | 931 | swp_entry_to_pte(make_hwpoison_entry(page))); |
817 | } else if (PageAnon(page)) { | 932 | } else if (PageAnon(page)) { |
@@ -822,14 +937,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
822 | * Store the swap location in the pte. | 937 | * Store the swap location in the pte. |
823 | * See handle_pte_fault() ... | 938 | * See handle_pte_fault() ... |
824 | */ | 939 | */ |
825 | swap_duplicate(entry); | 940 | if (swap_duplicate(entry) < 0) { |
941 | set_pte_at(mm, address, pte, pteval); | ||
942 | ret = SWAP_FAIL; | ||
943 | goto out_unmap; | ||
944 | } | ||
826 | if (list_empty(&mm->mmlist)) { | 945 | if (list_empty(&mm->mmlist)) { |
827 | spin_lock(&mmlist_lock); | 946 | spin_lock(&mmlist_lock); |
828 | if (list_empty(&mm->mmlist)) | 947 | if (list_empty(&mm->mmlist)) |
829 | list_add(&mm->mmlist, &init_mm.mmlist); | 948 | list_add(&mm->mmlist, &init_mm.mmlist); |
830 | spin_unlock(&mmlist_lock); | 949 | spin_unlock(&mmlist_lock); |
831 | } | 950 | } |
832 | dec_mm_counter(mm, anon_rss); | 951 | dec_mm_counter(mm, MM_ANONPAGES); |
952 | inc_mm_counter(mm, MM_SWAPENTS); | ||
833 | } else if (PAGE_MIGRATION) { | 953 | } else if (PAGE_MIGRATION) { |
834 | /* | 954 | /* |
835 | * Store the pfn of the page in a special migration | 955 | * Store the pfn of the page in a special migration |
@@ -847,8 +967,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
847 | entry = make_migration_entry(page, pte_write(pteval)); | 967 | entry = make_migration_entry(page, pte_write(pteval)); |
848 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 968 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
849 | } else | 969 | } else |
850 | dec_mm_counter(mm, file_rss); | 970 | dec_mm_counter(mm, MM_FILEPAGES); |
851 | |||
852 | 971 | ||
853 | page_remove_rmap(page); | 972 | page_remove_rmap(page); |
854 | page_cache_release(page); | 973 | page_cache_release(page); |
@@ -857,6 +976,27 @@ out_unmap: | |||
857 | pte_unmap_unlock(pte, ptl); | 976 | pte_unmap_unlock(pte, ptl); |
858 | out: | 977 | out: |
859 | return ret; | 978 | return ret; |
979 | |||
980 | out_mlock: | ||
981 | pte_unmap_unlock(pte, ptl); | ||
982 | |||
983 | |||
984 | /* | ||
985 | * We need mmap_sem locking here; otherwise the VM_LOCKED check is | ||
986 | * racy and can give an unstable result. We also cannot sleep here, | ||
987 | * because we currently hold anon_vma->lock or mapping->i_mmap_lock. | ||
988 | * If the trylock fails, the page stays on the evictable LRU, and | ||
989 | * vmscan may later retry moving it to the unevictable LRU once it | ||
990 | * sees that the page really is mlocked. | ||
991 | */ | ||
992 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
993 | if (vma->vm_flags & VM_LOCKED) { | ||
994 | mlock_vma_page(page); | ||
995 | ret = SWAP_MLOCK; | ||
996 | } | ||
997 | up_read(&vma->vm_mm->mmap_sem); | ||
998 | } | ||
999 | return ret; | ||
860 | } | 1000 | } |
861 | 1001 | ||
862 | /* | 1002 | /* |
@@ -922,11 +1062,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
922 | return ret; | 1062 | return ret; |
923 | 1063 | ||
924 | /* | 1064 | /* |
925 | * MLOCK_PAGES => feature is configured. | 1065 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, |
926 | * if we can acquire the mmap_sem for read, and vma is VM_LOCKED, | ||
927 | * keep the sem while scanning the cluster for mlocking pages. | 1066 | * keep the sem while scanning the cluster for mlocking pages. |
928 | */ | 1067 | */ |
929 | if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { | 1068 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { |
930 | locked_vma = (vma->vm_flags & VM_LOCKED); | 1069 | locked_vma = (vma->vm_flags & VM_LOCKED); |
931 | if (!locked_vma) | 1070 | if (!locked_vma) |
932 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ | 1071 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ |
@@ -967,7 +1106,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
967 | 1106 | ||
968 | page_remove_rmap(page); | 1107 | page_remove_rmap(page); |
969 | page_cache_release(page); | 1108 | page_cache_release(page); |
970 | dec_mm_counter(mm, file_rss); | 1109 | dec_mm_counter(mm, MM_FILEPAGES); |
971 | (*mapcount)--; | 1110 | (*mapcount)--; |
972 | } | 1111 | } |
973 | pte_unmap_unlock(pte - 1, ptl); | 1112 | pte_unmap_unlock(pte - 1, ptl); |
@@ -976,29 +1115,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
976 | return ret; | 1115 | return ret; |
977 | } | 1116 | } |
978 | 1117 | ||
979 | /* | ||
980 | * common handling for pages mapped in VM_LOCKED vmas | ||
981 | */ | ||
982 | static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) | ||
983 | { | ||
984 | int mlocked = 0; | ||
985 | |||
986 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
987 | if (vma->vm_flags & VM_LOCKED) { | ||
988 | mlock_vma_page(page); | ||
989 | mlocked++; /* really mlocked the page */ | ||
990 | } | ||
991 | up_read(&vma->vm_mm->mmap_sem); | ||
992 | } | ||
993 | return mlocked; | ||
994 | } | ||
995 | |||
996 | /** | 1118 | /** |
997 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based | 1119 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based |
998 | * rmap method | 1120 | * rmap method |
999 | * @page: the page to unmap/unlock | 1121 | * @page: the page to unmap/unlock |
1000 | * @unlock: request for unlock rather than unmap [unlikely] | 1122 | * @flags: action and flags |
1001 | * @migration: unmapping for migration - ignored if @unlock | ||
1002 | * | 1123 | * |
1003 | * Find all the mappings of a page using the mapping pointer and the vma chains | 1124 | * Find all the mappings of a page using the mapping pointer and the vma chains |
1004 | * contained in the anon_vma struct it points to. | 1125 | * contained in the anon_vma struct it points to. |
@@ -1013,43 +1134,24 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) | |||
1013 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | 1134 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) |
1014 | { | 1135 | { |
1015 | struct anon_vma *anon_vma; | 1136 | struct anon_vma *anon_vma; |
1016 | struct vm_area_struct *vma; | 1137 | struct anon_vma_chain *avc; |
1017 | unsigned int mlocked = 0; | ||
1018 | int ret = SWAP_AGAIN; | 1138 | int ret = SWAP_AGAIN; |
1019 | int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; | ||
1020 | |||
1021 | if (MLOCK_PAGES && unlikely(unlock)) | ||
1022 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | ||
1023 | 1139 | ||
1024 | anon_vma = page_lock_anon_vma(page); | 1140 | anon_vma = page_lock_anon_vma(page); |
1025 | if (!anon_vma) | 1141 | if (!anon_vma) |
1026 | return ret; | 1142 | return ret; |
1027 | 1143 | ||
1028 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1144 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
1029 | if (MLOCK_PAGES && unlikely(unlock)) { | 1145 | struct vm_area_struct *vma = avc->vma; |
1030 | if (!((vma->vm_flags & VM_LOCKED) && | 1146 | unsigned long address = vma_address(page, vma); |
1031 | page_mapped_in_vma(page, vma))) | 1147 | if (address == -EFAULT) |
1032 | continue; /* must visit all unlocked vmas */ | 1148 | continue; |
1033 | ret = SWAP_MLOCK; /* saw at least one mlocked vma */ | 1149 | ret = try_to_unmap_one(page, vma, address, flags); |
1034 | } else { | 1150 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1035 | ret = try_to_unmap_one(page, vma, flags); | 1151 | break; |
1036 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
1037 | break; | ||
1038 | } | ||
1039 | if (ret == SWAP_MLOCK) { | ||
1040 | mlocked = try_to_mlock_page(page, vma); | ||
1041 | if (mlocked) | ||
1042 | break; /* stop if actually mlocked page */ | ||
1043 | } | ||
1044 | } | 1152 | } |
1045 | 1153 | ||
1046 | page_unlock_anon_vma(anon_vma); | 1154 | page_unlock_anon_vma(anon_vma); |
1047 | |||
1048 | if (mlocked) | ||
1049 | ret = SWAP_MLOCK; /* actually mlocked the page */ | ||
1050 | else if (ret == SWAP_MLOCK) | ||
1051 | ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ | ||
1052 | |||
1053 | return ret; | 1155 | return ret; |
1054 | } | 1156 | } |
1055 | 1157 | ||
@@ -1079,48 +1181,30 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1079 | unsigned long max_nl_cursor = 0; | 1181 | unsigned long max_nl_cursor = 0; |
1080 | unsigned long max_nl_size = 0; | 1182 | unsigned long max_nl_size = 0; |
1081 | unsigned int mapcount; | 1183 | unsigned int mapcount; |
1082 | unsigned int mlocked = 0; | ||
1083 | int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; | ||
1084 | |||
1085 | if (MLOCK_PAGES && unlikely(unlock)) | ||
1086 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | ||
1087 | 1184 | ||
1088 | spin_lock(&mapping->i_mmap_lock); | 1185 | spin_lock(&mapping->i_mmap_lock); |
1089 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1186 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
1090 | if (MLOCK_PAGES && unlikely(unlock)) { | 1187 | unsigned long address = vma_address(page, vma); |
1091 | if (!((vma->vm_flags & VM_LOCKED) && | 1188 | if (address == -EFAULT) |
1092 | page_mapped_in_vma(page, vma))) | 1189 | continue; |
1093 | continue; /* must visit all vmas */ | 1190 | ret = try_to_unmap_one(page, vma, address, flags); |
1094 | ret = SWAP_MLOCK; | 1191 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1095 | } else { | 1192 | goto out; |
1096 | ret = try_to_unmap_one(page, vma, flags); | ||
1097 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
1098 | goto out; | ||
1099 | } | ||
1100 | if (ret == SWAP_MLOCK) { | ||
1101 | mlocked = try_to_mlock_page(page, vma); | ||
1102 | if (mlocked) | ||
1103 | break; /* stop if actually mlocked page */ | ||
1104 | } | ||
1105 | } | 1193 | } |
1106 | 1194 | ||
1107 | if (mlocked) | 1195 | if (list_empty(&mapping->i_mmap_nonlinear)) |
1108 | goto out; | 1196 | goto out; |
1109 | 1197 | ||
1110 | if (list_empty(&mapping->i_mmap_nonlinear)) | 1198 | /* |
1199 | * We don't bother to try to find the munlocked page in nonlinears. | ||
1200 | * It's costly. Instead, later, page reclaim logic may call | ||
1201 | * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. | ||
1202 | */ | ||
1203 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
1111 | goto out; | 1204 | goto out; |
1112 | 1205 | ||
1113 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1206 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1114 | shared.vm_set.list) { | 1207 | shared.vm_set.list) { |
1115 | if (MLOCK_PAGES && unlikely(unlock)) { | ||
1116 | if (!(vma->vm_flags & VM_LOCKED)) | ||
1117 | continue; /* must visit all vmas */ | ||
1118 | ret = SWAP_MLOCK; /* leave mlocked == 0 */ | ||
1119 | goto out; /* no need to look further */ | ||
1120 | } | ||
1121 | if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && | ||
1122 | (vma->vm_flags & VM_LOCKED)) | ||
1123 | continue; | ||
1124 | cursor = (unsigned long) vma->vm_private_data; | 1208 | cursor = (unsigned long) vma->vm_private_data; |
1125 | if (cursor > max_nl_cursor) | 1209 | if (cursor > max_nl_cursor) |
1126 | max_nl_cursor = cursor; | 1210 | max_nl_cursor = cursor; |
@@ -1153,16 +1237,12 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1153 | do { | 1237 | do { |
1154 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1238 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1155 | shared.vm_set.list) { | 1239 | shared.vm_set.list) { |
1156 | if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && | ||
1157 | (vma->vm_flags & VM_LOCKED)) | ||
1158 | continue; | ||
1159 | cursor = (unsigned long) vma->vm_private_data; | 1240 | cursor = (unsigned long) vma->vm_private_data; |
1160 | while ( cursor < max_nl_cursor && | 1241 | while ( cursor < max_nl_cursor && |
1161 | cursor < vma->vm_end - vma->vm_start) { | 1242 | cursor < vma->vm_end - vma->vm_start) { |
1162 | ret = try_to_unmap_cluster(cursor, &mapcount, | 1243 | if (try_to_unmap_cluster(cursor, &mapcount, |
1163 | vma, page); | 1244 | vma, page) == SWAP_MLOCK) |
1164 | if (ret == SWAP_MLOCK) | 1245 | ret = SWAP_MLOCK; |
1165 | mlocked = 2; /* to return below */ | ||
1166 | cursor += CLUSTER_SIZE; | 1246 | cursor += CLUSTER_SIZE; |
1167 | vma->vm_private_data = (void *) cursor; | 1247 | vma->vm_private_data = (void *) cursor; |
1168 | if ((int)mapcount <= 0) | 1248 | if ((int)mapcount <= 0) |
@@ -1183,10 +1263,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1183 | vma->vm_private_data = NULL; | 1263 | vma->vm_private_data = NULL; |
1184 | out: | 1264 | out: |
1185 | spin_unlock(&mapping->i_mmap_lock); | 1265 | spin_unlock(&mapping->i_mmap_lock); |
1186 | if (mlocked) | ||
1187 | ret = SWAP_MLOCK; /* actually mlocked the page */ | ||
1188 | else if (ret == SWAP_MLOCK) | ||
1189 | ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ | ||
1190 | return ret; | 1266 | return ret; |
1191 | } | 1267 | } |
1192 | 1268 | ||
@@ -1210,7 +1286,9 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1210 | 1286 | ||
1211 | BUG_ON(!PageLocked(page)); | 1287 | BUG_ON(!PageLocked(page)); |
1212 | 1288 | ||
1213 | if (PageAnon(page)) | 1289 | if (unlikely(PageKsm(page))) |
1290 | ret = try_to_unmap_ksm(page, flags); | ||
1291 | else if (PageAnon(page)) | ||
1214 | ret = try_to_unmap_anon(page, flags); | 1292 | ret = try_to_unmap_anon(page, flags); |
1215 | else | 1293 | else |
1216 | ret = try_to_unmap_file(page, flags); | 1294 | ret = try_to_unmap_file(page, flags); |
@@ -1229,17 +1307,99 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1229 | * | 1307 | * |
1230 | * Return values are: | 1308 | * Return values are: |
1231 | * | 1309 | * |
1232 | * SWAP_SUCCESS - no vma's holding page mlocked. | 1310 | * SWAP_AGAIN - no vma is holding page mlocked, or, |
1233 | * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem | 1311 | * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem |
1312 | * SWAP_FAIL - page cannot be located at present | ||
1234 | * SWAP_MLOCK - page is now mlocked. | 1313 | * SWAP_MLOCK - page is now mlocked. |
1235 | */ | 1314 | */ |
1236 | int try_to_munlock(struct page *page) | 1315 | int try_to_munlock(struct page *page) |
1237 | { | 1316 | { |
1238 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); | 1317 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); |
1239 | 1318 | ||
1240 | if (PageAnon(page)) | 1319 | if (unlikely(PageKsm(page))) |
1320 | return try_to_unmap_ksm(page, TTU_MUNLOCK); | ||
1321 | else if (PageAnon(page)) | ||
1241 | return try_to_unmap_anon(page, TTU_MUNLOCK); | 1322 | return try_to_unmap_anon(page, TTU_MUNLOCK); |
1242 | else | 1323 | else |
1243 | return try_to_unmap_file(page, TTU_MUNLOCK); | 1324 | return try_to_unmap_file(page, TTU_MUNLOCK); |
1244 | } | 1325 | } |
1245 | 1326 | ||
1327 | #ifdef CONFIG_MIGRATION | ||
1328 | /* | ||
1329 | * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): | ||
1330 | * Called by migrate.c to remove migration ptes, but might be used more later. | ||
1331 | */ | ||
1332 | static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | ||
1333 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1334 | { | ||
1335 | struct anon_vma *anon_vma; | ||
1336 | struct anon_vma_chain *avc; | ||
1337 | int ret = SWAP_AGAIN; | ||
1338 | |||
1339 | /* | ||
1340 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma() | ||
1341 | * because that depends on page_mapped(); but not all its usages | ||
1342 | * are holding mmap_sem, which also gave the necessary guarantee | ||
1343 | * (that this anon_vma's slab has not already been destroyed). | ||
1344 | * This needs to be reviewed later: avoiding page_lock_anon_vma() | ||
1345 | * is risky, and currently limits the usefulness of rmap_walk(). | ||
1346 | */ | ||
1347 | anon_vma = page_anon_vma(page); | ||
1348 | if (!anon_vma) | ||
1349 | return ret; | ||
1350 | spin_lock(&anon_vma->lock); | ||
1351 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | ||
1352 | struct vm_area_struct *vma = avc->vma; | ||
1353 | unsigned long address = vma_address(page, vma); | ||
1354 | if (address == -EFAULT) | ||
1355 | continue; | ||
1356 | ret = rmap_one(page, vma, address, arg); | ||
1357 | if (ret != SWAP_AGAIN) | ||
1358 | break; | ||
1359 | } | ||
1360 | spin_unlock(&anon_vma->lock); | ||
1361 | return ret; | ||
1362 | } | ||
1363 | |||
1364 | static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | ||
1365 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1366 | { | ||
1367 | struct address_space *mapping = page->mapping; | ||
1368 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1369 | struct vm_area_struct *vma; | ||
1370 | struct prio_tree_iter iter; | ||
1371 | int ret = SWAP_AGAIN; | ||
1372 | |||
1373 | if (!mapping) | ||
1374 | return ret; | ||
1375 | spin_lock(&mapping->i_mmap_lock); | ||
1376 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | ||
1377 | unsigned long address = vma_address(page, vma); | ||
1378 | if (address == -EFAULT) | ||
1379 | continue; | ||
1380 | ret = rmap_one(page, vma, address, arg); | ||
1381 | if (ret != SWAP_AGAIN) | ||
1382 | break; | ||
1383 | } | ||
1384 | /* | ||
1385 | * No nonlinear handling: being always shared, nonlinear vmas | ||
1386 | * never contain migration ptes. Decide what to do about this | ||
1387 | * limitation to linear when we need rmap_walk() on nonlinear. | ||
1388 | */ | ||
1389 | spin_unlock(&mapping->i_mmap_lock); | ||
1390 | return ret; | ||
1391 | } | ||
1392 | |||
1393 | int rmap_walk(struct page *page, int (*rmap_one)(struct page *, | ||
1394 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1395 | { | ||
1396 | VM_BUG_ON(!PageLocked(page)); | ||
1397 | |||
1398 | if (unlikely(PageKsm(page))) | ||
1399 | return rmap_walk_ksm(page, rmap_one, arg); | ||
1400 | else if (PageAnon(page)) | ||
1401 | return rmap_walk_anon(page, rmap_one, arg); | ||
1402 | else | ||
1403 | return rmap_walk_file(page, rmap_one, arg); | ||
1404 | } | ||
1405 | #endif /* CONFIG_MIGRATION */ | ||
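Editor's note: rmap_walk() hands every (vma, address) at which the page might be mapped to a caller-supplied callback, stopping as soon as the callback returns anything other than SWAP_AGAIN. A sketch of a callback in the shape the migrate.c comment above describes; the names and the body are illustrative, not the real remove_migration_pte():

/* Illustrative rmap_walk() callback: visit one mapping of 'page'. */
static int fixup_one_mapping(struct page *page, struct vm_area_struct *vma,
			     unsigned long address, void *arg)
{
	struct page *newpage = arg;	/* e.g. the replacement page */

	/* ... find the pte at 'address' and repoint it at 'newpage' ... */

	return SWAP_AGAIN;		/* keep walking the remaining mappings */
}

/* Caller side: walk all mappings of 'page', threading 'newpage' through. */
static void fixup_all_mappings(struct page *page, struct page *newpage)
{
	rmap_walk(page, fixup_one_mapping, newpage);
}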