Diffstat (limited to 'mm/rmap.c')
-rw-r--r--   mm/rmap.c   568
1 file changed, 372 insertions(+), 196 deletions(-)
@@ -49,6 +49,7 @@
49 | #include <linux/swapops.h> | 49 | #include <linux/swapops.h> |
50 | #include <linux/slab.h> | 50 | #include <linux/slab.h> |
51 | #include <linux/init.h> | 51 | #include <linux/init.h> |
52 | #include <linux/ksm.h> | ||
52 | #include <linux/rmap.h> | 53 | #include <linux/rmap.h> |
53 | #include <linux/rcupdate.h> | 54 | #include <linux/rcupdate.h> |
54 | #include <linux/module.h> | 55 | #include <linux/module.h> |
@@ -61,17 +62,28 @@
61 | #include "internal.h" | 62 | #include "internal.h" |
62 | 63 | ||
63 | static struct kmem_cache *anon_vma_cachep; | 64 | static struct kmem_cache *anon_vma_cachep; |
65 | static struct kmem_cache *anon_vma_chain_cachep; | ||
64 | 66 | ||
65 | static inline struct anon_vma *anon_vma_alloc(void) | 67 | static inline struct anon_vma *anon_vma_alloc(void) |
66 | { | 68 | { |
67 | return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); | 69 | return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); |
68 | } | 70 | } |
69 | 71 | ||
70 | static inline void anon_vma_free(struct anon_vma *anon_vma) | 72 | void anon_vma_free(struct anon_vma *anon_vma) |
71 | { | 73 | { |
72 | kmem_cache_free(anon_vma_cachep, anon_vma); | 74 | kmem_cache_free(anon_vma_cachep, anon_vma); |
73 | } | 75 | } |
74 | 76 | ||
77 | static inline struct anon_vma_chain *anon_vma_chain_alloc(void) | ||
78 | { | ||
79 | return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); | ||
80 | } | ||
81 | |||
82 | void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | ||
83 | { | ||
84 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); | ||
85 | } | ||
86 | |||
75 | /** | 87 | /** |
76 | * anon_vma_prepare - attach an anon_vma to a memory region | 88 | * anon_vma_prepare - attach an anon_vma to a memory region |
77 | * @vma: the memory region in question | 89 | * @vma: the memory region in question |
@@ -102,87 +114,167 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
102 | int anon_vma_prepare(struct vm_area_struct *vma) | 114 | int anon_vma_prepare(struct vm_area_struct *vma) |
103 | { | 115 | { |
104 | struct anon_vma *anon_vma = vma->anon_vma; | 116 | struct anon_vma *anon_vma = vma->anon_vma; |
117 | struct anon_vma_chain *avc; | ||
105 | 118 | ||
106 | might_sleep(); | 119 | might_sleep(); |
107 | if (unlikely(!anon_vma)) { | 120 | if (unlikely(!anon_vma)) { |
108 | struct mm_struct *mm = vma->vm_mm; | 121 | struct mm_struct *mm = vma->vm_mm; |
109 | struct anon_vma *allocated; | 122 | struct anon_vma *allocated; |
110 | 123 | ||
124 | avc = anon_vma_chain_alloc(); | ||
125 | if (!avc) | ||
126 | goto out_enomem; | ||
127 | |||
111 | anon_vma = find_mergeable_anon_vma(vma); | 128 | anon_vma = find_mergeable_anon_vma(vma); |
112 | allocated = NULL; | 129 | allocated = NULL; |
113 | if (!anon_vma) { | 130 | if (!anon_vma) { |
114 | anon_vma = anon_vma_alloc(); | 131 | anon_vma = anon_vma_alloc(); |
115 | if (unlikely(!anon_vma)) | 132 | if (unlikely(!anon_vma)) |
116 | return -ENOMEM; | 133 | goto out_enomem_free_avc; |
117 | allocated = anon_vma; | 134 | allocated = anon_vma; |
118 | } | 135 | } |
119 | spin_lock(&anon_vma->lock); | ||
120 | 136 | ||
137 | spin_lock(&anon_vma->lock); | ||
121 | /* page_table_lock to protect against threads */ | 138 | /* page_table_lock to protect against threads */ |
122 | spin_lock(&mm->page_table_lock); | 139 | spin_lock(&mm->page_table_lock); |
123 | if (likely(!vma->anon_vma)) { | 140 | if (likely(!vma->anon_vma)) { |
124 | vma->anon_vma = anon_vma; | 141 | vma->anon_vma = anon_vma; |
125 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 142 | avc->anon_vma = anon_vma; |
143 | avc->vma = vma; | ||
144 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
145 | list_add(&avc->same_anon_vma, &anon_vma->head); | ||
126 | allocated = NULL; | 146 | allocated = NULL; |
147 | avc = NULL; | ||
127 | } | 148 | } |
128 | spin_unlock(&mm->page_table_lock); | 149 | spin_unlock(&mm->page_table_lock); |
129 | |||
130 | spin_unlock(&anon_vma->lock); | 150 | spin_unlock(&anon_vma->lock); |
151 | |||
131 | if (unlikely(allocated)) | 152 | if (unlikely(allocated)) |
132 | anon_vma_free(allocated); | 153 | anon_vma_free(allocated); |
154 | if (unlikely(avc)) | ||
155 | anon_vma_chain_free(avc); | ||
133 | } | 156 | } |
134 | return 0; | 157 | return 0; |
158 | |||
159 | out_enomem_free_avc: | ||
160 | anon_vma_chain_free(avc); | ||
161 | out_enomem: | ||
162 | return -ENOMEM; | ||
135 | } | 163 | } |
136 | 164 | ||
137 | void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) | 165 | static void anon_vma_chain_link(struct vm_area_struct *vma, |
166 | struct anon_vma_chain *avc, | ||
167 | struct anon_vma *anon_vma) | ||
138 | { | 168 | { |
139 | BUG_ON(vma->anon_vma != next->anon_vma); | 169 | avc->vma = vma; |
140 | list_del(&next->anon_vma_node); | 170 | avc->anon_vma = anon_vma; |
171 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
172 | |||
173 | spin_lock(&anon_vma->lock); | ||
174 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
175 | spin_unlock(&anon_vma->lock); | ||
141 | } | 176 | } |
142 | 177 | ||
143 | void __anon_vma_link(struct vm_area_struct *vma) | 178 | /* |
179 | * Attach the anon_vmas from src to dst. | ||
180 | * Returns 0 on success, -ENOMEM on failure. | ||
181 | */ | ||
182 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | ||
144 | { | 183 | { |
145 | struct anon_vma *anon_vma = vma->anon_vma; | 184 | struct anon_vma_chain *avc, *pavc; |
146 | 185 | ||
147 | if (anon_vma) | 186 | list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { |
148 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 187 | avc = anon_vma_chain_alloc(); |
188 | if (!avc) | ||
189 | goto enomem_failure; | ||
190 | anon_vma_chain_link(dst, avc, pavc->anon_vma); | ||
191 | } | ||
192 | return 0; | ||
193 | |||
194 | enomem_failure: | ||
195 | unlink_anon_vmas(dst); | ||
196 | return -ENOMEM; | ||
149 | } | 197 | } |
150 | 198 | ||
151 | void anon_vma_link(struct vm_area_struct *vma) | 199 | /* |
200 | * Attach vma to its own anon_vma, as well as to the anon_vmas that | ||
201 | * the corresponding VMA in the parent process is attached to. | ||
202 | * Returns 0 on success, non-zero on failure. | ||
203 | */ | ||
204 | int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | ||
152 | { | 205 | { |
153 | struct anon_vma *anon_vma = vma->anon_vma; | 206 | struct anon_vma_chain *avc; |
207 | struct anon_vma *anon_vma; | ||
154 | 208 | ||
155 | if (anon_vma) { | 209 | /* Don't bother if the parent process has no anon_vma here. */ |
156 | spin_lock(&anon_vma->lock); | 210 | if (!pvma->anon_vma) |
157 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 211 | return 0; |
158 | spin_unlock(&anon_vma->lock); | 212 | |
159 | } | 213 | /* |
214 | * First, attach the new VMA to the parent VMA's anon_vmas, | ||
215 | * so rmap can find non-COWed pages in child processes. | ||
216 | */ | ||
217 | if (anon_vma_clone(vma, pvma)) | ||
218 | return -ENOMEM; | ||
219 | |||
220 | /* Then add our own anon_vma. */ | ||
221 | anon_vma = anon_vma_alloc(); | ||
222 | if (!anon_vma) | ||
223 | goto out_error; | ||
224 | avc = anon_vma_chain_alloc(); | ||
225 | if (!avc) | ||
226 | goto out_error_free_anon_vma; | ||
227 | anon_vma_chain_link(vma, avc, anon_vma); | ||
228 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | ||
229 | vma->anon_vma = anon_vma; | ||
230 | |||
231 | return 0; | ||
232 | |||
233 | out_error_free_anon_vma: | ||
234 | anon_vma_free(anon_vma); | ||
235 | out_error: | ||
236 | unlink_anon_vmas(vma); | ||
237 | return -ENOMEM; | ||
160 | } | 238 | } |
161 | 239 | ||
162 | void anon_vma_unlink(struct vm_area_struct *vma) | 240 | static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) |
163 | { | 241 | { |
164 | struct anon_vma *anon_vma = vma->anon_vma; | 242 | struct anon_vma *anon_vma = anon_vma_chain->anon_vma; |
165 | int empty; | 243 | int empty; |
166 | 244 | ||
245 | /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ | ||
167 | if (!anon_vma) | 246 | if (!anon_vma) |
168 | return; | 247 | return; |
169 | 248 | ||
170 | spin_lock(&anon_vma->lock); | 249 | spin_lock(&anon_vma->lock); |
171 | list_del(&vma->anon_vma_node); | 250 | list_del(&anon_vma_chain->same_anon_vma); |
172 | 251 | ||
173 | /* We must garbage collect the anon_vma if it's empty */ | 252 | /* We must garbage collect the anon_vma if it's empty */ |
174 | empty = list_empty(&anon_vma->head); | 253 | empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); |
175 | spin_unlock(&anon_vma->lock); | 254 | spin_unlock(&anon_vma->lock); |
176 | 255 | ||
177 | if (empty) | 256 | if (empty) |
178 | anon_vma_free(anon_vma); | 257 | anon_vma_free(anon_vma); |
179 | } | 258 | } |
180 | 259 | ||
260 | void unlink_anon_vmas(struct vm_area_struct *vma) | ||
261 | { | ||
262 | struct anon_vma_chain *avc, *next; | ||
263 | |||
264 | /* Unlink each anon_vma chained to the VMA. */ | ||
265 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | ||
266 | anon_vma_unlink(avc); | ||
267 | list_del(&avc->same_vma); | ||
268 | anon_vma_chain_free(avc); | ||
269 | } | ||
270 | } | ||
271 | |||
181 | static void anon_vma_ctor(void *data) | 272 | static void anon_vma_ctor(void *data) |
182 | { | 273 | { |
183 | struct anon_vma *anon_vma = data; | 274 | struct anon_vma *anon_vma = data; |
184 | 275 | ||
185 | spin_lock_init(&anon_vma->lock); | 276 | spin_lock_init(&anon_vma->lock); |
277 | ksm_refcount_init(anon_vma); | ||
186 | INIT_LIST_HEAD(&anon_vma->head); | 278 | INIT_LIST_HEAD(&anon_vma->head); |
187 | } | 279 | } |
188 | 280 | ||
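The core of this change: each anon_vma_chain entry sits on two lists at once, vma->anon_vma_chain (threaded through same_vma) and anon_vma->head (threaded through same_anon_vma). A minimal sketch of walking both lists, assuming the usual locking rules (mmap_sem held for the per-VMA list, anon_vma->lock for the per-anon_vma list); the two dump helpers are hypothetical and not part of this diff:

static void dump_vma_chain(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc;

	/* the VMA's own anon_vma plus those inherited from ancestors */
	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
		printk(KERN_DEBUG "vma %p -> anon_vma %p\n", vma, avc->anon_vma);
}

static void dump_anon_vma(struct anon_vma *anon_vma)
{
	struct anon_vma_chain *avc;

	/* every VMA, across processes, that may map this anon_vma's pages */
	spin_lock(&anon_vma->lock);
	list_for_each_entry(avc, &anon_vma->head, same_anon_vma)
		printk(KERN_DEBUG "anon_vma %p <- vma %p\n", anon_vma, avc->vma);
	spin_unlock(&anon_vma->lock);
}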
@@ -190,6 +282,7 @@ void __init anon_vma_init(void)
190 | { | 282 | { |
191 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), | 283 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), |
192 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); | 284 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); |
285 | anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); | ||
193 | } | 286 | } |
194 | 287 | ||
195 | /* | 288 | /* |
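For reference, KMEM_CACHE() is the usual shorthand from include/linux/slab.h (unchanged by this diff); the new chain cache needs no constructor, unlike anon_vma_cachep:

/*
 * KMEM_CACHE(anon_vma_chain, SLAB_PANIC) expands to roughly:
 *
 *	kmem_cache_create("anon_vma_chain",
 *			  sizeof(struct anon_vma_chain),
 *			  __alignof__(struct anon_vma_chain),
 *			  SLAB_PANIC, NULL);
 */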
@@ -202,8 +295,8 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
202 | unsigned long anon_mapping; | 295 | unsigned long anon_mapping; |
203 | 296 | ||
204 | rcu_read_lock(); | 297 | rcu_read_lock(); |
205 | anon_mapping = (unsigned long) page->mapping; | 298 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); |
206 | if (!(anon_mapping & PAGE_MAPPING_ANON)) | 299 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) |
207 | goto out; | 300 | goto out; |
208 | if (!page_mapped(page)) | 301 | if (!page_mapped(page)) |
209 | goto out; | 302 | goto out; |
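The stricter check above is about KSM. A sketch of the page->mapping tag bits it relies on, with the values as defined in include/linux/mm.h around this time (only the table below is assumed, nothing else):

/*
 * PAGE_MAPPING_ANON  = 1
 * PAGE_MAPPING_KSM   = 2
 * PAGE_MAPPING_FLAGS = PAGE_MAPPING_ANON | PAGE_MAPPING_KSM
 *
 * file page:       page->mapping = address_space *              (low bits 00)
 * anonymous page:  page->mapping = anon_vma * | ANON            (low bits 01)
 * KSM page:        page->mapping = KSM metadata * | ANON | KSM  (low bits 11)
 *
 * So "(mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_ANON" matches plain
 * anonymous pages only; KSM pages no longer fall through to the anon_vma
 * lookup in page_lock_anon_vma().
 */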
@@ -243,15 +336,13 @@ vma_address(struct page *page, struct vm_area_struct *vma)
243 | 336 | ||
244 | /* | 337 | /* |
245 | * At what user virtual address is page expected in vma? | 338 | * At what user virtual address is page expected in vma? |
246 | * checking that the page matches the vma. | 339 | * Caller should check the page is actually part of the vma. |
247 | */ | 340 | */ |
248 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 341 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
249 | { | 342 | { |
250 | if (PageAnon(page)) { | 343 | if (PageAnon(page)) |
251 | if ((void *)vma->anon_vma != | 344 | ; |
252 | (void *)page->mapping - PAGE_MAPPING_ANON) | 345 | else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { |
253 | return -EFAULT; | ||
254 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { | ||
255 | if (!vma->vm_file || | 346 | if (!vma->vm_file || |
256 | vma->vm_file->f_mapping != page->mapping) | 347 | vma->vm_file->f_mapping != page->mapping) |
257 | return -EFAULT; | 348 | return -EFAULT; |
@@ -337,21 +428,15 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
337 | * Subfunctions of page_referenced: page_referenced_one called | 428 | * Subfunctions of page_referenced: page_referenced_one called |
338 | * repeatedly from either page_referenced_anon or page_referenced_file. | 429 | * repeatedly from either page_referenced_anon or page_referenced_file. |
339 | */ | 430 | */ |
340 | static int page_referenced_one(struct page *page, | 431 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, |
341 | struct vm_area_struct *vma, | 432 | unsigned long address, unsigned int *mapcount, |
342 | unsigned int *mapcount, | 433 | unsigned long *vm_flags) |
343 | unsigned long *vm_flags) | ||
344 | { | 434 | { |
345 | struct mm_struct *mm = vma->vm_mm; | 435 | struct mm_struct *mm = vma->vm_mm; |
346 | unsigned long address; | ||
347 | pte_t *pte; | 436 | pte_t *pte; |
348 | spinlock_t *ptl; | 437 | spinlock_t *ptl; |
349 | int referenced = 0; | 438 | int referenced = 0; |
350 | 439 | ||
351 | address = vma_address(page, vma); | ||
352 | if (address == -EFAULT) | ||
353 | goto out; | ||
354 | |||
355 | pte = page_check_address(page, mm, address, &ptl, 0); | 440 | pte = page_check_address(page, mm, address, &ptl, 0); |
356 | if (!pte) | 441 | if (!pte) |
357 | goto out; | 442 | goto out; |
@@ -388,9 +473,10 @@ static int page_referenced_one(struct page *page,
388 | out_unmap: | 473 | out_unmap: |
389 | (*mapcount)--; | 474 | (*mapcount)--; |
390 | pte_unmap_unlock(pte, ptl); | 475 | pte_unmap_unlock(pte, ptl); |
391 | out: | 476 | |
392 | if (referenced) | 477 | if (referenced) |
393 | *vm_flags |= vma->vm_flags; | 478 | *vm_flags |= vma->vm_flags; |
479 | out: | ||
394 | return referenced; | 480 | return referenced; |
395 | } | 481 | } |
396 | 482 | ||
@@ -400,7 +486,7 @@ static int page_referenced_anon(struct page *page,
400 | { | 486 | { |
401 | unsigned int mapcount; | 487 | unsigned int mapcount; |
402 | struct anon_vma *anon_vma; | 488 | struct anon_vma *anon_vma; |
403 | struct vm_area_struct *vma; | 489 | struct anon_vma_chain *avc; |
404 | int referenced = 0; | 490 | int referenced = 0; |
405 | 491 | ||
406 | anon_vma = page_lock_anon_vma(page); | 492 | anon_vma = page_lock_anon_vma(page); |
@@ -408,7 +494,11 @@ static int page_referenced_anon(struct page *page,
408 | return referenced; | 494 | return referenced; |
409 | 495 | ||
410 | mapcount = page_mapcount(page); | 496 | mapcount = page_mapcount(page); |
411 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 497 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
498 | struct vm_area_struct *vma = avc->vma; | ||
499 | unsigned long address = vma_address(page, vma); | ||
500 | if (address == -EFAULT) | ||
501 | continue; | ||
412 | /* | 502 | /* |
413 | * If we are reclaiming on behalf of a cgroup, skip | 503 | * If we are reclaiming on behalf of a cgroup, skip |
414 | * counting on behalf of references from different | 504 | * counting on behalf of references from different |
@@ -416,7 +506,7 @@ static int page_referenced_anon(struct page *page,
416 | */ | 506 | */ |
417 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 507 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
418 | continue; | 508 | continue; |
419 | referenced += page_referenced_one(page, vma, | 509 | referenced += page_referenced_one(page, vma, address, |
420 | &mapcount, vm_flags); | 510 | &mapcount, vm_flags); |
421 | if (!mapcount) | 511 | if (!mapcount) |
422 | break; | 512 | break; |
@@ -474,6 +564,9 @@ static int page_referenced_file(struct page *page,
474 | mapcount = page_mapcount(page); | 564 | mapcount = page_mapcount(page); |
475 | 565 | ||
476 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 566 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
567 | unsigned long address = vma_address(page, vma); | ||
568 | if (address == -EFAULT) | ||
569 | continue; | ||
477 | /* | 570 | /* |
478 | * If we are reclaiming on behalf of a cgroup, skip | 571 | * If we are reclaiming on behalf of a cgroup, skip |
479 | * counting on behalf of references from different | 572 | * counting on behalf of references from different |
@@ -481,7 +574,7 @@ static int page_referenced_file(struct page *page,
481 | */ | 574 | */ |
482 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 575 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
483 | continue; | 576 | continue; |
484 | referenced += page_referenced_one(page, vma, | 577 | referenced += page_referenced_one(page, vma, address, |
485 | &mapcount, vm_flags); | 578 | &mapcount, vm_flags); |
486 | if (!mapcount) | 579 | if (!mapcount) |
487 | break; | 580 | break; |
@@ -507,46 +600,44 @@ int page_referenced(struct page *page,
507 | unsigned long *vm_flags) | 600 | unsigned long *vm_flags) |
508 | { | 601 | { |
509 | int referenced = 0; | 602 | int referenced = 0; |
510 | 603 | int we_locked = 0; | |
511 | if (TestClearPageReferenced(page)) | ||
512 | referenced++; | ||
513 | 604 | ||
514 | *vm_flags = 0; | 605 | *vm_flags = 0; |
515 | if (page_mapped(page) && page->mapping) { | 606 | if (page_mapped(page) && page_rmapping(page)) { |
516 | if (PageAnon(page)) | 607 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { |
608 | we_locked = trylock_page(page); | ||
609 | if (!we_locked) { | ||
610 | referenced++; | ||
611 | goto out; | ||
612 | } | ||
613 | } | ||
614 | if (unlikely(PageKsm(page))) | ||
615 | referenced += page_referenced_ksm(page, mem_cont, | ||
616 | vm_flags); | ||
617 | else if (PageAnon(page)) | ||
517 | referenced += page_referenced_anon(page, mem_cont, | 618 | referenced += page_referenced_anon(page, mem_cont, |
518 | vm_flags); | 619 | vm_flags); |
519 | else if (is_locked) | 620 | else if (page->mapping) |
520 | referenced += page_referenced_file(page, mem_cont, | 621 | referenced += page_referenced_file(page, mem_cont, |
521 | vm_flags); | 622 | vm_flags); |
522 | else if (!trylock_page(page)) | 623 | if (we_locked) |
523 | referenced++; | ||
524 | else { | ||
525 | if (page->mapping) | ||
526 | referenced += page_referenced_file(page, | ||
527 | mem_cont, vm_flags); | ||
528 | unlock_page(page); | 624 | unlock_page(page); |
529 | } | ||
530 | } | 625 | } |
531 | 626 | out: | |
532 | if (page_test_and_clear_young(page)) | 627 | if (page_test_and_clear_young(page)) |
533 | referenced++; | 628 | referenced++; |
534 | 629 | ||
535 | return referenced; | 630 | return referenced; |
536 | } | 631 | } |
537 | 632 | ||
538 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | 633 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, |
634 | unsigned long address) | ||
539 | { | 635 | { |
540 | struct mm_struct *mm = vma->vm_mm; | 636 | struct mm_struct *mm = vma->vm_mm; |
541 | unsigned long address; | ||
542 | pte_t *pte; | 637 | pte_t *pte; |
543 | spinlock_t *ptl; | 638 | spinlock_t *ptl; |
544 | int ret = 0; | 639 | int ret = 0; |
545 | 640 | ||
546 | address = vma_address(page, vma); | ||
547 | if (address == -EFAULT) | ||
548 | goto out; | ||
549 | |||
550 | pte = page_check_address(page, mm, address, &ptl, 1); | 641 | pte = page_check_address(page, mm, address, &ptl, 1); |
551 | if (!pte) | 642 | if (!pte) |
552 | goto out; | 643 | goto out; |
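To show how the new is_locked/we_locked handling is meant to be used, a hypothetical reclaim-side caller (the real call sites are in mm/vmscan.c, outside this diff; the helper name is made up):

static int page_was_referenced(struct page *page, struct mem_cgroup *memcg)
{
	unsigned long vm_flags;
	int referenced;

	/*
	 * Reclaim already holds the page lock, so is_locked = 1 and
	 * page_referenced() never needs to trylock the page itself.
	 */
	referenced = page_referenced(page, 1, memcg, &vm_flags);
	if (vm_flags & VM_LOCKED)
		return -1;	/* mapped into a VM_LOCKED vma: cull it */
	return referenced;
}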
@@ -578,8 +669,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
578 | 669 | ||
579 | spin_lock(&mapping->i_mmap_lock); | 670 | spin_lock(&mapping->i_mmap_lock); |
580 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 671 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
581 | if (vma->vm_flags & VM_SHARED) | 672 | if (vma->vm_flags & VM_SHARED) { |
582 | ret += page_mkclean_one(page, vma); | 673 | unsigned long address = vma_address(page, vma); |
674 | if (address == -EFAULT) | ||
675 | continue; | ||
676 | ret += page_mkclean_one(page, vma, address); | ||
677 | } | ||
583 | } | 678 | } |
584 | spin_unlock(&mapping->i_mmap_lock); | 679 | spin_unlock(&mapping->i_mmap_lock); |
585 | return ret; | 680 | return ret; |
@@ -607,27 +702,60 @@ int page_mkclean(struct page *page)
607 | EXPORT_SYMBOL_GPL(page_mkclean); | 702 | EXPORT_SYMBOL_GPL(page_mkclean); |
608 | 703 | ||
609 | /** | 704 | /** |
705 | * page_move_anon_rmap - move a page to our anon_vma | ||
706 | * @page: the page to move to our anon_vma | ||
707 | * @vma: the vma the page belongs to | ||
708 | * @address: the user virtual address mapped | ||
709 | * | ||
710 | * When a page belongs exclusively to one process after a COW event, | ||
711 | * that page can be moved into the anon_vma that belongs to just that | ||
712 | * process, so the rmap code will not search the parent or sibling | ||
713 | * processes. | ||
714 | */ | ||
715 | void page_move_anon_rmap(struct page *page, | ||
716 | struct vm_area_struct *vma, unsigned long address) | ||
717 | { | ||
718 | struct anon_vma *anon_vma = vma->anon_vma; | ||
719 | |||
720 | VM_BUG_ON(!PageLocked(page)); | ||
721 | VM_BUG_ON(!anon_vma); | ||
722 | VM_BUG_ON(page->index != linear_page_index(vma, address)); | ||
723 | |||
724 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
725 | page->mapping = (struct address_space *) anon_vma; | ||
726 | } | ||
727 | |||
728 | /** | ||
610 | * __page_set_anon_rmap - setup new anonymous rmap | 729 | * __page_set_anon_rmap - setup new anonymous rmap |
611 | * @page: the page to add the mapping to | 730 | * @page: the page to add the mapping to |
612 | * @vma: the vm area in which the mapping is added | 731 | * @vma: the vm area in which the mapping is added |
613 | * @address: the user virtual address mapped | 732 | * @address: the user virtual address mapped |
733 | * @exclusive: the page is exclusively owned by the current process | ||
614 | */ | 734 | */ |
615 | static void __page_set_anon_rmap(struct page *page, | 735 | static void __page_set_anon_rmap(struct page *page, |
616 | struct vm_area_struct *vma, unsigned long address) | 736 | struct vm_area_struct *vma, unsigned long address, int exclusive) |
617 | { | 737 | { |
618 | struct anon_vma *anon_vma = vma->anon_vma; | 738 | struct anon_vma *anon_vma = vma->anon_vma; |
619 | 739 | ||
620 | BUG_ON(!anon_vma); | 740 | BUG_ON(!anon_vma); |
621 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
622 | page->mapping = (struct address_space *) anon_vma; | ||
623 | |||
624 | page->index = linear_page_index(vma, address); | ||
625 | 741 | ||
626 | /* | 742 | /* |
627 | * nr_mapped state can be updated without turning off | 743 | * If the page isn't exclusively mapped into this vma, |
628 | * interrupts because it is not modified via interrupt. | 744 | * we must use the _oldest_ possible anon_vma for the |
745 | * page mapping! | ||
746 | * | ||
747 | * So take the last AVC chain entry in the vma, which is | ||
748 | * the deepest ancestor, and use the anon_vma from that. | ||
629 | */ | 749 | */ |
630 | __inc_zone_page_state(page, NR_ANON_PAGES); | 750 | if (!exclusive) { |
751 | struct anon_vma_chain *avc; | ||
752 | avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma); | ||
753 | anon_vma = avc->anon_vma; | ||
754 | } | ||
755 | |||
756 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
757 | page->mapping = (struct address_space *) anon_vma; | ||
758 | page->index = linear_page_index(vma, address); | ||
631 | } | 759 | } |
632 | 760 | ||
633 | /** | 761 | /** |
@@ -652,9 +780,6 @@ static void __page_check_anon_rmap(struct page *page,
652 | * are initially only visible via the pagetables, and the pte is locked | 780 | * are initially only visible via the pagetables, and the pte is locked |
653 | * over the call to page_add_new_anon_rmap. | 781 | * over the call to page_add_new_anon_rmap. |
654 | */ | 782 | */ |
655 | struct anon_vma *anon_vma = vma->anon_vma; | ||
656 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
657 | BUG_ON(page->mapping != (struct address_space *)anon_vma); | ||
658 | BUG_ON(page->index != linear_page_index(vma, address)); | 783 | BUG_ON(page->index != linear_page_index(vma, address)); |
659 | #endif | 784 | #endif |
660 | } | 785 | } |
@@ -665,15 +790,24 @@ static void __page_check_anon_rmap(struct page *page,
665 | * @vma: the vm area in which the mapping is added | 790 | * @vma: the vm area in which the mapping is added |
666 | * @address: the user virtual address mapped | 791 | * @address: the user virtual address mapped |
667 | * | 792 | * |
668 | * The caller needs to hold the pte lock and the page must be locked. | 793 | * The caller needs to hold the pte lock, and the page must be locked in |
794 | * the anon_vma case: to serialize mapping,index checking after setting, | ||
795 | * and to ensure that PageAnon is not being upgraded racily to PageKsm | ||
796 | * (but PageKsm is never downgraded to PageAnon). | ||
669 | */ | 797 | */ |
670 | void page_add_anon_rmap(struct page *page, | 798 | void page_add_anon_rmap(struct page *page, |
671 | struct vm_area_struct *vma, unsigned long address) | 799 | struct vm_area_struct *vma, unsigned long address) |
672 | { | 800 | { |
801 | int first = atomic_inc_and_test(&page->_mapcount); | ||
802 | if (first) | ||
803 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
804 | if (unlikely(PageKsm(page))) | ||
805 | return; | ||
806 | |||
673 | VM_BUG_ON(!PageLocked(page)); | 807 | VM_BUG_ON(!PageLocked(page)); |
674 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 808 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
675 | if (atomic_inc_and_test(&page->_mapcount)) | 809 | if (first) |
676 | __page_set_anon_rmap(page, vma, address); | 810 | __page_set_anon_rmap(page, vma, address, 0); |
677 | else | 811 | else |
678 | __page_check_anon_rmap(page, vma, address); | 812 | __page_check_anon_rmap(page, vma, address); |
679 | } | 813 | } |
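Putting __page_set_anon_rmap()'s new exclusive logic together with anon_vma_fork() above, a worked example of the usual fork-then-COW sequence (illustrative scenario, not taken from this diff):

/*
 * Parent P faults in an anonymous page, then forks child C:
 *
 *   P: vma->anon_vma_chain = { A_P }          (from anon_vma_prepare)
 *   C: vma->anon_vma_chain = { A_C, A_P }     (anon_vma_fork list_add()s the
 *                                              child's own A_C at the head;
 *                                              cloned ancestors follow)
 *
 * A page C still shares with P must remain findable through A_P, so a
 * non-exclusive page_add_anon_rmap() files it under the chain's last entry,
 * the oldest ancestor -- which is what the !exclusive branch picks via
 * vma->anon_vma_chain.prev.  A freshly COWed page belongs to C alone, so
 * page_add_new_anon_rmap() passes exclusive = 1 and uses
 * vma->anon_vma == A_C directly.
 */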
@@ -694,7 +828,8 @@ void page_add_new_anon_rmap(struct page *page,
694 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 828 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
695 | SetPageSwapBacked(page); | 829 | SetPageSwapBacked(page); |
696 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 830 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
697 | __page_set_anon_rmap(page, vma, address); | 831 | __inc_zone_page_state(page, NR_ANON_PAGES); |
832 | __page_set_anon_rmap(page, vma, address, 1); | ||
698 | if (page_evictable(page, vma)) | 833 | if (page_evictable(page, vma)) |
699 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 834 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
700 | else | 835 | else |
@@ -711,7 +846,7 @@ void page_add_file_rmap(struct page *page)
711 | { | 846 | { |
712 | if (atomic_inc_and_test(&page->_mapcount)) { | 847 | if (atomic_inc_and_test(&page->_mapcount)) { |
713 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 848 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
714 | mem_cgroup_update_mapped_file_stat(page, 1); | 849 | mem_cgroup_update_file_mapped(page, 1); |
715 | } | 850 | } |
716 | } | 851 | } |
717 | 852 | ||
@@ -743,8 +878,8 @@ void page_remove_rmap(struct page *page)
743 | __dec_zone_page_state(page, NR_ANON_PAGES); | 878 | __dec_zone_page_state(page, NR_ANON_PAGES); |
744 | } else { | 879 | } else { |
745 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 880 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
881 | mem_cgroup_update_file_mapped(page, -1); | ||
746 | } | 882 | } |
747 | mem_cgroup_update_mapped_file_stat(page, -1); | ||
748 | /* | 883 | /* |
749 | * It would be tidy to reset the PageAnon mapping here, | 884 | * It would be tidy to reset the PageAnon mapping here, |
750 | * but that might overwrite a racing page_add_anon_rmap | 885 | * but that might overwrite a racing page_add_anon_rmap |
@@ -760,20 +895,15 @@ void page_remove_rmap(struct page *page)
760 | * Subfunctions of try_to_unmap: try_to_unmap_one called | 895 | * Subfunctions of try_to_unmap: try_to_unmap_one called |
761 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. | 896 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. |
762 | */ | 897 | */ |
763 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 898 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
764 | enum ttu_flags flags) | 899 | unsigned long address, enum ttu_flags flags) |
765 | { | 900 | { |
766 | struct mm_struct *mm = vma->vm_mm; | 901 | struct mm_struct *mm = vma->vm_mm; |
767 | unsigned long address; | ||
768 | pte_t *pte; | 902 | pte_t *pte; |
769 | pte_t pteval; | 903 | pte_t pteval; |
770 | spinlock_t *ptl; | 904 | spinlock_t *ptl; |
771 | int ret = SWAP_AGAIN; | 905 | int ret = SWAP_AGAIN; |
772 | 906 | ||
773 | address = vma_address(page, vma); | ||
774 | if (address == -EFAULT) | ||
775 | goto out; | ||
776 | |||
777 | pte = page_check_address(page, mm, address, &ptl, 0); | 907 | pte = page_check_address(page, mm, address, &ptl, 0); |
778 | if (!pte) | 908 | if (!pte) |
779 | goto out; | 909 | goto out; |
@@ -784,10 +914,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
784 | * skipped over this mm) then we should reactivate it. | 914 | * skipped over this mm) then we should reactivate it. |
785 | */ | 915 | */ |
786 | if (!(flags & TTU_IGNORE_MLOCK)) { | 916 | if (!(flags & TTU_IGNORE_MLOCK)) { |
787 | if (vma->vm_flags & VM_LOCKED) { | 917 | if (vma->vm_flags & VM_LOCKED) |
788 | ret = SWAP_MLOCK; | 918 | goto out_mlock; |
919 | |||
920 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
789 | goto out_unmap; | 921 | goto out_unmap; |
790 | } | ||
791 | } | 922 | } |
792 | if (!(flags & TTU_IGNORE_ACCESS)) { | 923 | if (!(flags & TTU_IGNORE_ACCESS)) { |
793 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | 924 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
@@ -809,9 +940,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
809 | 940 | ||
810 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { | 941 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
811 | if (PageAnon(page)) | 942 | if (PageAnon(page)) |
812 | dec_mm_counter(mm, anon_rss); | 943 | dec_mm_counter(mm, MM_ANONPAGES); |
813 | else | 944 | else |
814 | dec_mm_counter(mm, file_rss); | 945 | dec_mm_counter(mm, MM_FILEPAGES); |
815 | set_pte_at(mm, address, pte, | 946 | set_pte_at(mm, address, pte, |
816 | swp_entry_to_pte(make_hwpoison_entry(page))); | 947 | swp_entry_to_pte(make_hwpoison_entry(page))); |
817 | } else if (PageAnon(page)) { | 948 | } else if (PageAnon(page)) { |
@@ -822,14 +953,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
822 | * Store the swap location in the pte. | 953 | * Store the swap location in the pte. |
823 | * See handle_pte_fault() ... | 954 | * See handle_pte_fault() ... |
824 | */ | 955 | */ |
825 | swap_duplicate(entry); | 956 | if (swap_duplicate(entry) < 0) { |
957 | set_pte_at(mm, address, pte, pteval); | ||
958 | ret = SWAP_FAIL; | ||
959 | goto out_unmap; | ||
960 | } | ||
826 | if (list_empty(&mm->mmlist)) { | 961 | if (list_empty(&mm->mmlist)) { |
827 | spin_lock(&mmlist_lock); | 962 | spin_lock(&mmlist_lock); |
828 | if (list_empty(&mm->mmlist)) | 963 | if (list_empty(&mm->mmlist)) |
829 | list_add(&mm->mmlist, &init_mm.mmlist); | 964 | list_add(&mm->mmlist, &init_mm.mmlist); |
830 | spin_unlock(&mmlist_lock); | 965 | spin_unlock(&mmlist_lock); |
831 | } | 966 | } |
832 | dec_mm_counter(mm, anon_rss); | 967 | dec_mm_counter(mm, MM_ANONPAGES); |
968 | inc_mm_counter(mm, MM_SWAPENTS); | ||
833 | } else if (PAGE_MIGRATION) { | 969 | } else if (PAGE_MIGRATION) { |
834 | /* | 970 | /* |
835 | * Store the pfn of the page in a special migration | 971 | * Store the pfn of the page in a special migration |
@@ -847,8 +983,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
847 | entry = make_migration_entry(page, pte_write(pteval)); | 983 | entry = make_migration_entry(page, pte_write(pteval)); |
848 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 984 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
849 | } else | 985 | } else |
850 | dec_mm_counter(mm, file_rss); | 986 | dec_mm_counter(mm, MM_FILEPAGES); |
851 | |||
852 | 987 | ||
853 | page_remove_rmap(page); | 988 | page_remove_rmap(page); |
854 | page_cache_release(page); | 989 | page_cache_release(page); |
@@ -857,6 +992,27 @@ out_unmap:
857 | pte_unmap_unlock(pte, ptl); | 992 | pte_unmap_unlock(pte, ptl); |
858 | out: | 993 | out: |
859 | return ret; | 994 | return ret; |
995 | |||
996 | out_mlock: | ||
997 | pte_unmap_unlock(pte, ptl); | ||
998 | |||
999 | |||
1000 | /* | ||
1001 | * We need mmap_sem locking here: otherwise the VM_LOCKED check is | ||
1002 | * racy and gives an unstable result. We also cannot sleep, because | ||
1003 | * we currently hold anon_vma->lock or mapping->i_mmap_lock. | ||
1004 | * If the trylock fails, the page stays on the evictable LRU and | ||
1005 | * vmscan can later retry moving it to the unevictable LRU if the | ||
1006 | * page really is mlocked. | ||
1007 | */ | ||
1008 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
1009 | if (vma->vm_flags & VM_LOCKED) { | ||
1010 | mlock_vma_page(page); | ||
1011 | ret = SWAP_MLOCK; | ||
1012 | } | ||
1013 | up_read(&vma->vm_mm->mmap_sem); | ||
1014 | } | ||
1015 | return ret; | ||
860 | } | 1016 | } |
861 | 1017 | ||
862 | /* | 1018 | /* |
@@ -922,11 +1078,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
922 | return ret; | 1078 | return ret; |
923 | 1079 | ||
924 | /* | 1080 | /* |
925 | * MLOCK_PAGES => feature is configured. | 1081 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, |
926 | * if we can acquire the mmap_sem for read, and vma is VM_LOCKED, | ||
927 | * keep the sem while scanning the cluster for mlocking pages. | 1082 | * keep the sem while scanning the cluster for mlocking pages. |
928 | */ | 1083 | */ |
929 | if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { | 1084 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { |
930 | locked_vma = (vma->vm_flags & VM_LOCKED); | 1085 | locked_vma = (vma->vm_flags & VM_LOCKED); |
931 | if (!locked_vma) | 1086 | if (!locked_vma) |
932 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ | 1087 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ |
@@ -967,7 +1122,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
967 | 1122 | ||
968 | page_remove_rmap(page); | 1123 | page_remove_rmap(page); |
969 | page_cache_release(page); | 1124 | page_cache_release(page); |
970 | dec_mm_counter(mm, file_rss); | 1125 | dec_mm_counter(mm, MM_FILEPAGES); |
971 | (*mapcount)--; | 1126 | (*mapcount)--; |
972 | } | 1127 | } |
973 | pte_unmap_unlock(pte - 1, ptl); | 1128 | pte_unmap_unlock(pte - 1, ptl); |
@@ -976,29 +1131,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
976 | return ret; | 1131 | return ret; |
977 | } | 1132 | } |
978 | 1133 | ||
979 | /* | ||
980 | * common handling for pages mapped in VM_LOCKED vmas | ||
981 | */ | ||
982 | static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) | ||
983 | { | ||
984 | int mlocked = 0; | ||
985 | |||
986 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
987 | if (vma->vm_flags & VM_LOCKED) { | ||
988 | mlock_vma_page(page); | ||
989 | mlocked++; /* really mlocked the page */ | ||
990 | } | ||
991 | up_read(&vma->vm_mm->mmap_sem); | ||
992 | } | ||
993 | return mlocked; | ||
994 | } | ||
995 | |||
996 | /** | 1134 | /** |
997 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based | 1135 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based |
998 | * rmap method | 1136 | * rmap method |
999 | * @page: the page to unmap/unlock | 1137 | * @page: the page to unmap/unlock |
1000 | * @unlock: request for unlock rather than unmap [unlikely] | 1138 | * @flags: action and flags |
1001 | * @migration: unmapping for migration - ignored if @unlock | ||
1002 | * | 1139 | * |
1003 | * Find all the mappings of a page using the mapping pointer and the vma chains | 1140 | * Find all the mappings of a page using the mapping pointer and the vma chains |
1004 | * contained in the anon_vma struct it points to. | 1141 | * contained in the anon_vma struct it points to. |
@@ -1013,43 +1150,24 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
1013 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | 1150 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) |
1014 | { | 1151 | { |
1015 | struct anon_vma *anon_vma; | 1152 | struct anon_vma *anon_vma; |
1016 | struct vm_area_struct *vma; | 1153 | struct anon_vma_chain *avc; |
1017 | unsigned int mlocked = 0; | ||
1018 | int ret = SWAP_AGAIN; | 1154 | int ret = SWAP_AGAIN; |
1019 | int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; | ||
1020 | |||
1021 | if (MLOCK_PAGES && unlikely(unlock)) | ||
1022 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | ||
1023 | 1155 | ||
1024 | anon_vma = page_lock_anon_vma(page); | 1156 | anon_vma = page_lock_anon_vma(page); |
1025 | if (!anon_vma) | 1157 | if (!anon_vma) |
1026 | return ret; | 1158 | return ret; |
1027 | 1159 | ||
1028 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1160 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
1029 | if (MLOCK_PAGES && unlikely(unlock)) { | 1161 | struct vm_area_struct *vma = avc->vma; |
1030 | if (!((vma->vm_flags & VM_LOCKED) && | 1162 | unsigned long address = vma_address(page, vma); |
1031 | page_mapped_in_vma(page, vma))) | 1163 | if (address == -EFAULT) |
1032 | continue; /* must visit all unlocked vmas */ | 1164 | continue; |
1033 | ret = SWAP_MLOCK; /* saw at least one mlocked vma */ | 1165 | ret = try_to_unmap_one(page, vma, address, flags); |
1034 | } else { | 1166 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1035 | ret = try_to_unmap_one(page, vma, flags); | 1167 | break; |
1036 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
1037 | break; | ||
1038 | } | ||
1039 | if (ret == SWAP_MLOCK) { | ||
1040 | mlocked = try_to_mlock_page(page, vma); | ||
1041 | if (mlocked) | ||
1042 | break; /* stop if actually mlocked page */ | ||
1043 | } | ||
1044 | } | 1168 | } |
1045 | 1169 | ||
1046 | page_unlock_anon_vma(anon_vma); | 1170 | page_unlock_anon_vma(anon_vma); |
1047 | |||
1048 | if (mlocked) | ||
1049 | ret = SWAP_MLOCK; /* actually mlocked the page */ | ||
1050 | else if (ret == SWAP_MLOCK) | ||
1051 | ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ | ||
1052 | |||
1053 | return ret; | 1171 | return ret; |
1054 | } | 1172 | } |
1055 | 1173 | ||
@@ -1079,48 +1197,30 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1079 | unsigned long max_nl_cursor = 0; | 1197 | unsigned long max_nl_cursor = 0; |
1080 | unsigned long max_nl_size = 0; | 1198 | unsigned long max_nl_size = 0; |
1081 | unsigned int mapcount; | 1199 | unsigned int mapcount; |
1082 | unsigned int mlocked = 0; | ||
1083 | int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; | ||
1084 | |||
1085 | if (MLOCK_PAGES && unlikely(unlock)) | ||
1086 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | ||
1087 | 1200 | ||
1088 | spin_lock(&mapping->i_mmap_lock); | 1201 | spin_lock(&mapping->i_mmap_lock); |
1089 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1202 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
1090 | if (MLOCK_PAGES && unlikely(unlock)) { | 1203 | unsigned long address = vma_address(page, vma); |
1091 | if (!((vma->vm_flags & VM_LOCKED) && | 1204 | if (address == -EFAULT) |
1092 | page_mapped_in_vma(page, vma))) | 1205 | continue; |
1093 | continue; /* must visit all vmas */ | 1206 | ret = try_to_unmap_one(page, vma, address, flags); |
1094 | ret = SWAP_MLOCK; | 1207 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1095 | } else { | 1208 | goto out; |
1096 | ret = try_to_unmap_one(page, vma, flags); | ||
1097 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
1098 | goto out; | ||
1099 | } | ||
1100 | if (ret == SWAP_MLOCK) { | ||
1101 | mlocked = try_to_mlock_page(page, vma); | ||
1102 | if (mlocked) | ||
1103 | break; /* stop if actually mlocked page */ | ||
1104 | } | ||
1105 | } | 1209 | } |
1106 | 1210 | ||
1107 | if (mlocked) | 1211 | if (list_empty(&mapping->i_mmap_nonlinear)) |
1108 | goto out; | 1212 | goto out; |
1109 | 1213 | ||
1110 | if (list_empty(&mapping->i_mmap_nonlinear)) | 1214 | /* |
1215 | * We don't bother to try to find the munlocked page in nonlinears. | ||
1216 | * It's costly. Instead, later, page reclaim logic may call | ||
1217 | * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. | ||
1218 | */ | ||
1219 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
1111 | goto out; | 1220 | goto out; |
1112 | 1221 | ||
1113 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1222 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1114 | shared.vm_set.list) { | 1223 | shared.vm_set.list) { |
1115 | if (MLOCK_PAGES && unlikely(unlock)) { | ||
1116 | if (!(vma->vm_flags & VM_LOCKED)) | ||
1117 | continue; /* must visit all vmas */ | ||
1118 | ret = SWAP_MLOCK; /* leave mlocked == 0 */ | ||
1119 | goto out; /* no need to look further */ | ||
1120 | } | ||
1121 | if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && | ||
1122 | (vma->vm_flags & VM_LOCKED)) | ||
1123 | continue; | ||
1124 | cursor = (unsigned long) vma->vm_private_data; | 1224 | cursor = (unsigned long) vma->vm_private_data; |
1125 | if (cursor > max_nl_cursor) | 1225 | if (cursor > max_nl_cursor) |
1126 | max_nl_cursor = cursor; | 1226 | max_nl_cursor = cursor; |
@@ -1153,16 +1253,12 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1153 | do { | 1253 | do { |
1154 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1254 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1155 | shared.vm_set.list) { | 1255 | shared.vm_set.list) { |
1156 | if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && | ||
1157 | (vma->vm_flags & VM_LOCKED)) | ||
1158 | continue; | ||
1159 | cursor = (unsigned long) vma->vm_private_data; | 1256 | cursor = (unsigned long) vma->vm_private_data; |
1160 | while ( cursor < max_nl_cursor && | 1257 | while ( cursor < max_nl_cursor && |
1161 | cursor < vma->vm_end - vma->vm_start) { | 1258 | cursor < vma->vm_end - vma->vm_start) { |
1162 | ret = try_to_unmap_cluster(cursor, &mapcount, | 1259 | if (try_to_unmap_cluster(cursor, &mapcount, |
1163 | vma, page); | 1260 | vma, page) == SWAP_MLOCK) |
1164 | if (ret == SWAP_MLOCK) | 1261 | ret = SWAP_MLOCK; |
1165 | mlocked = 2; /* to return below */ | ||
1166 | cursor += CLUSTER_SIZE; | 1262 | cursor += CLUSTER_SIZE; |
1167 | vma->vm_private_data = (void *) cursor; | 1263 | vma->vm_private_data = (void *) cursor; |
1168 | if ((int)mapcount <= 0) | 1264 | if ((int)mapcount <= 0) |
@@ -1183,10 +1279,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1183 | vma->vm_private_data = NULL; | 1279 | vma->vm_private_data = NULL; |
1184 | out: | 1280 | out: |
1185 | spin_unlock(&mapping->i_mmap_lock); | 1281 | spin_unlock(&mapping->i_mmap_lock); |
1186 | if (mlocked) | ||
1187 | ret = SWAP_MLOCK; /* actually mlocked the page */ | ||
1188 | else if (ret == SWAP_MLOCK) | ||
1189 | ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ | ||
1190 | return ret; | 1282 | return ret; |
1191 | } | 1283 | } |
1192 | 1284 | ||
@@ -1210,7 +1302,9 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1210 | 1302 | ||
1211 | BUG_ON(!PageLocked(page)); | 1303 | BUG_ON(!PageLocked(page)); |
1212 | 1304 | ||
1213 | if (PageAnon(page)) | 1305 | if (unlikely(PageKsm(page))) |
1306 | ret = try_to_unmap_ksm(page, flags); | ||
1307 | else if (PageAnon(page)) | ||
1214 | ret = try_to_unmap_anon(page, flags); | 1308 | ret = try_to_unmap_anon(page, flags); |
1215 | else | 1309 | else |
1216 | ret = try_to_unmap_file(page, flags); | 1310 | ret = try_to_unmap_file(page, flags); |
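A hypothetical caller showing how the SWAP_* results of try_to_unmap() are typically consumed (the real consumers live in mm/vmscan.c and mm/memory-failure.c, outside this diff; the helper and its error mapping are made up):

static int unmap_page_for_reclaim(struct page *page, int for_migration)
{
	enum ttu_flags flags = for_migration ? TTU_MIGRATION : TTU_UNMAP;

	switch (try_to_unmap(page, flags)) {
	case SWAP_SUCCESS:
		return 0;		/* every pte is gone, page can be freed */
	case SWAP_MLOCK:
		return -EBUSY;		/* mlocked somewhere: move to unevictable */
	case SWAP_AGAIN:
	case SWAP_FAIL:
	default:
		return -EAGAIN;		/* still mapped, try again later */
	}
}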
@@ -1229,17 +1323,99 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1229 | * | 1323 | * |
1230 | * Return values are: | 1324 | * Return values are: |
1231 | * | 1325 | * |
1232 | * SWAP_SUCCESS - no vma's holding page mlocked. | 1326 | * SWAP_AGAIN - no vma is holding page mlocked, or, |
1233 | * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem | 1327 | * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem |
1328 | * SWAP_FAIL - page cannot be located at present | ||
1234 | * SWAP_MLOCK - page is now mlocked. | 1329 | * SWAP_MLOCK - page is now mlocked. |
1235 | */ | 1330 | */ |
1236 | int try_to_munlock(struct page *page) | 1331 | int try_to_munlock(struct page *page) |
1237 | { | 1332 | { |
1238 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); | 1333 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); |
1239 | 1334 | ||
1240 | if (PageAnon(page)) | 1335 | if (unlikely(PageKsm(page))) |
1336 | return try_to_unmap_ksm(page, TTU_MUNLOCK); | ||
1337 | else if (PageAnon(page)) | ||
1241 | return try_to_unmap_anon(page, TTU_MUNLOCK); | 1338 | return try_to_unmap_anon(page, TTU_MUNLOCK); |
1242 | else | 1339 | else |
1243 | return try_to_unmap_file(page, TTU_MUNLOCK); | 1340 | return try_to_unmap_file(page, TTU_MUNLOCK); |
1244 | } | 1341 | } |
1245 | 1342 | ||
1343 | #ifdef CONFIG_MIGRATION | ||
1344 | /* | ||
1345 | * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): | ||
1346 | * Called by migrate.c to remove migration ptes, but might be used more later. | ||
1347 | */ | ||
1348 | static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | ||
1349 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1350 | { | ||
1351 | struct anon_vma *anon_vma; | ||
1352 | struct anon_vma_chain *avc; | ||
1353 | int ret = SWAP_AGAIN; | ||
1354 | |||
1355 | /* | ||
1356 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma() | ||
1357 | * because that depends on page_mapped(); but not all its usages | ||
1358 | * are holding mmap_sem, which also gave the necessary guarantee | ||
1359 | * (that this anon_vma's slab has not already been destroyed). | ||
1360 | * This needs to be reviewed later: avoiding page_lock_anon_vma() | ||
1361 | * is risky, and currently limits the usefulness of rmap_walk(). | ||
1362 | */ | ||
1363 | anon_vma = page_anon_vma(page); | ||
1364 | if (!anon_vma) | ||
1365 | return ret; | ||
1366 | spin_lock(&anon_vma->lock); | ||
1367 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | ||
1368 | struct vm_area_struct *vma = avc->vma; | ||
1369 | unsigned long address = vma_address(page, vma); | ||
1370 | if (address == -EFAULT) | ||
1371 | continue; | ||
1372 | ret = rmap_one(page, vma, address, arg); | ||
1373 | if (ret != SWAP_AGAIN) | ||
1374 | break; | ||
1375 | } | ||
1376 | spin_unlock(&anon_vma->lock); | ||
1377 | return ret; | ||
1378 | } | ||
1379 | |||
1380 | static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | ||
1381 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1382 | { | ||
1383 | struct address_space *mapping = page->mapping; | ||
1384 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1385 | struct vm_area_struct *vma; | ||
1386 | struct prio_tree_iter iter; | ||
1387 | int ret = SWAP_AGAIN; | ||
1388 | |||
1389 | if (!mapping) | ||
1390 | return ret; | ||
1391 | spin_lock(&mapping->i_mmap_lock); | ||
1392 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | ||
1393 | unsigned long address = vma_address(page, vma); | ||
1394 | if (address == -EFAULT) | ||
1395 | continue; | ||
1396 | ret = rmap_one(page, vma, address, arg); | ||
1397 | if (ret != SWAP_AGAIN) | ||
1398 | break; | ||
1399 | } | ||
1400 | /* | ||
1401 | * No nonlinear handling: being always shared, nonlinear vmas | ||
1402 | * never contain migration ptes. Decide what to do about this | ||
1403 | * limitation to linear when we need rmap_walk() on nonlinear. | ||
1404 | */ | ||
1405 | spin_unlock(&mapping->i_mmap_lock); | ||
1406 | return ret; | ||
1407 | } | ||
1408 | |||
1409 | int rmap_walk(struct page *page, int (*rmap_one)(struct page *, | ||
1410 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1411 | { | ||
1412 | VM_BUG_ON(!PageLocked(page)); | ||
1413 | |||
1414 | if (unlikely(PageKsm(page))) | ||
1415 | return rmap_walk_ksm(page, rmap_one, arg); | ||
1416 | else if (PageAnon(page)) | ||
1417 | return rmap_walk_anon(page, rmap_one, arg); | ||
1418 | else | ||
1419 | return rmap_walk_file(page, rmap_one, arg); | ||
1420 | } | ||
1421 | #endif /* CONFIG_MIGRATION */ | ||
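Finally, a sketch of an rmap_walk() user in the style the comment above describes (the real callback is remove_migration_pte() in mm/migrate.c; the counting helpers below are hypothetical):

static int count_one_mapping(struct page *page, struct vm_area_struct *vma,
			     unsigned long address, void *arg)
{
	unsigned long *count = arg;

	(*count)++;		/* one pte maps 'page' at 'address' in 'vma' */
	return SWAP_AGAIN;	/* keep walking; any other value stops the walk */
}

static unsigned long count_page_mappings(struct page *page)
{
	unsigned long count = 0;

	/* page must be locked, as rmap_walk() itself asserts */
	rmap_walk(page, count_one_mapping, &count);
	return count;
}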