author     Glenn Elliott <gelliott@cs.unc.edu>   2012-03-04 19:47:13 -0500
committer  Glenn Elliott <gelliott@cs.unc.edu>   2012-03-04 19:47:13 -0500
commit     c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree       ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /mm/rmap.c
parent     ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent     6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts:
litmus/sched_cedf.c
Diffstat (limited to 'mm/rmap.c')
-rw-r--r-- | mm/rmap.c | 523
1 file changed, 346 insertions, 177 deletions
@@ -24,22 +24,22 @@ | |||
24 | * inode->i_alloc_sem (vmtruncate_range) | 24 | * inode->i_alloc_sem (vmtruncate_range) |
25 | * mm->mmap_sem | 25 | * mm->mmap_sem |
26 | * page->flags PG_locked (lock_page) | 26 | * page->flags PG_locked (lock_page) |
27 | * mapping->i_mmap_lock | 27 | * mapping->i_mmap_mutex |
28 | * anon_vma->lock | 28 | * anon_vma->mutex |
29 | * mm->page_table_lock or pte_lock | 29 | * mm->page_table_lock or pte_lock |
30 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 30 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
31 | * swap_lock (in swap_duplicate, swap_info_get) | 31 | * swap_lock (in swap_duplicate, swap_info_get) |
32 | * mmlist_lock (in mmput, drain_mmlist and others) | 32 | * mmlist_lock (in mmput, drain_mmlist and others) |
33 | * mapping->private_lock (in __set_page_dirty_buffers) | 33 | * mapping->private_lock (in __set_page_dirty_buffers) |
34 | * inode_lock (in set_page_dirty's __mark_inode_dirty) | 34 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) |
35 | * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) | ||
35 | * sb_lock (within inode_lock in fs/fs-writeback.c) | 36 | * sb_lock (within inode_lock in fs/fs-writeback.c) |
36 | * mapping->tree_lock (widely used, in set_page_dirty, | 37 | * mapping->tree_lock (widely used, in set_page_dirty, |
37 | * in arch-dependent flush_dcache_mmap_lock, | 38 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within inode_lock in __sync_single_inode) | 39 | * within inode_wb_list_lock in __sync_single_inode) |
39 | * | 40 | * |
40 | * (code doesn't rely on that order so it could be switched around) | 41 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) |
41 | * ->tasklist_lock | 42 | * ->tasklist_lock |
42 | * anon_vma->lock (memory_failure, collect_procs_anon) | ||
43 | * pte map lock | 43 | * pte map lock |
44 | */ | 44 | */ |
45 | 45 | ||
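The hunk above rewrites the lock-ordering comment for the spinlock-to-mutex conversion: anon_vma->lock becomes anon_vma->mutex, mapping->i_mmap_lock becomes mapping->i_mmap_mutex, and the old inode_lock entry is split into inode->i_lock and inode_wb_list_lock. As a schematic only (not code from this commit), one nesting consistent with the documented order is:

	down_read(&mm->mmap_sem);		/* mm->mmap_sem */
	lock_page(page);			/* page->flags PG_locked */
	mutex_lock(&mapping->i_mmap_mutex);	/* mapping->i_mmap_mutex */
	anon_vma_lock(anon_vma);		/* anon_vma->mutex (taken on the root) */
	spin_lock(ptl);				/* mm->page_table_lock or pte lock */
	/* ... rmap work ... */
	spin_unlock(ptl);
	anon_vma_unlock(anon_vma);
	mutex_unlock(&mapping->i_mmap_mutex);
	unlock_page(page);
	up_read(&mm->mmap_sem);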
@@ -67,20 +67,56 @@ static struct kmem_cache *anon_vma_chain_cachep; | |||
67 | 67 | ||
68 | static inline struct anon_vma *anon_vma_alloc(void) | 68 | static inline struct anon_vma *anon_vma_alloc(void) |
69 | { | 69 | { |
70 | return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); | 70 | struct anon_vma *anon_vma; |
71 | |||
72 | anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); | ||
73 | if (anon_vma) { | ||
74 | atomic_set(&anon_vma->refcount, 1); | ||
75 | /* | ||
76 | * Initialise the anon_vma root to point to itself. If called | ||
77 | * from fork, the root will be reset to the parents anon_vma. | ||
78 | */ | ||
79 | anon_vma->root = anon_vma; | ||
80 | } | ||
81 | |||
82 | return anon_vma; | ||
71 | } | 83 | } |
72 | 84 | ||
73 | void anon_vma_free(struct anon_vma *anon_vma) | 85 | static inline void anon_vma_free(struct anon_vma *anon_vma) |
74 | { | 86 | { |
87 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); | ||
88 | |||
89 | /* | ||
90 | * Synchronize against page_lock_anon_vma() such that | ||
91 | * we can safely hold the lock without the anon_vma getting | ||
92 | * freed. | ||
93 | * | ||
94 | * Relies on the full mb implied by the atomic_dec_and_test() from | ||
95 | * put_anon_vma() against the acquire barrier implied by | ||
96 | * mutex_trylock() from page_lock_anon_vma(). This orders: | ||
97 | * | ||
98 | * page_lock_anon_vma() VS put_anon_vma() | ||
99 | * mutex_trylock() atomic_dec_and_test() | ||
100 | * LOCK MB | ||
101 | * atomic_read() mutex_is_locked() | ||
102 | * | ||
103 | * LOCK should suffice since the actual taking of the lock must | ||
104 | * happen _before_ what follows. | ||
105 | */ | ||
106 | if (mutex_is_locked(&anon_vma->root->mutex)) { | ||
107 | anon_vma_lock(anon_vma); | ||
108 | anon_vma_unlock(anon_vma); | ||
109 | } | ||
110 | |||
75 | kmem_cache_free(anon_vma_cachep, anon_vma); | 111 | kmem_cache_free(anon_vma_cachep, anon_vma); |
76 | } | 112 | } |
77 | 113 | ||
78 | static inline struct anon_vma_chain *anon_vma_chain_alloc(void) | 114 | static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) |
79 | { | 115 | { |
80 | return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); | 116 | return kmem_cache_alloc(anon_vma_chain_cachep, gfp); |
81 | } | 117 | } |
82 | 118 | ||
83 | void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | 119 | static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) |
84 | { | 120 | { |
85 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); | 121 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); |
86 | } | 122 | } |
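The refcount that anon_vma_alloc() now initialises to 1, and that anon_vma_free() asserts has dropped to 0, is manipulated through get_anon_vma()/put_anon_vma(). Those helpers are not part of this hunk; assuming they sit in include/linux/rmap.h as in mainline kernels of this series, they look roughly like:

	static inline void get_anon_vma(struct anon_vma *anon_vma)
	{
		atomic_inc(&anon_vma->refcount);
	}

	void __put_anon_vma(struct anon_vma *anon_vma);	/* appears later in this diff */

	static inline void put_anon_vma(struct anon_vma *anon_vma)
	{
		/* the final decrement supplies the full barrier anon_vma_free() relies on */
		if (atomic_dec_and_test(&anon_vma->refcount))
			__put_anon_vma(anon_vma);
	}

Callers that used to free an anon_vma directly now drop a reference instead, which is why anon_vma_free() can become static here.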
@@ -94,7 +130,7 @@ void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | |||
94 | * anonymous pages mapped into it with that anon_vma. | 130 | * anonymous pages mapped into it with that anon_vma. |
95 | * | 131 | * |
96 | * The common case will be that we already have one, but if | 132 | * The common case will be that we already have one, but if |
97 | * if not we either need to find an adjacent mapping that we | 133 | * not we either need to find an adjacent mapping that we |
98 | * can re-use the anon_vma from (very common when the only | 134 | * can re-use the anon_vma from (very common when the only |
99 | * reason for splitting a vma has been mprotect()), or we | 135 | * reason for splitting a vma has been mprotect()), or we |
100 | * allocate a new one. | 136 | * allocate a new one. |
@@ -122,7 +158,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
122 | struct mm_struct *mm = vma->vm_mm; | 158 | struct mm_struct *mm = vma->vm_mm; |
123 | struct anon_vma *allocated; | 159 | struct anon_vma *allocated; |
124 | 160 | ||
125 | avc = anon_vma_chain_alloc(); | 161 | avc = anon_vma_chain_alloc(GFP_KERNEL); |
126 | if (!avc) | 162 | if (!avc) |
127 | goto out_enomem; | 163 | goto out_enomem; |
128 | 164 | ||
@@ -133,11 +169,6 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
133 | if (unlikely(!anon_vma)) | 169 | if (unlikely(!anon_vma)) |
134 | goto out_enomem_free_avc; | 170 | goto out_enomem_free_avc; |
135 | allocated = anon_vma; | 171 | allocated = anon_vma; |
136 | /* | ||
137 | * This VMA had no anon_vma yet. This anon_vma is | ||
138 | * the root of any anon_vma tree that might form. | ||
139 | */ | ||
140 | anon_vma->root = anon_vma; | ||
141 | } | 172 | } |
142 | 173 | ||
143 | anon_vma_lock(anon_vma); | 174 | anon_vma_lock(anon_vma); |
@@ -156,7 +187,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
156 | anon_vma_unlock(anon_vma); | 187 | anon_vma_unlock(anon_vma); |
157 | 188 | ||
158 | if (unlikely(allocated)) | 189 | if (unlikely(allocated)) |
159 | anon_vma_free(allocated); | 190 | put_anon_vma(allocated); |
160 | if (unlikely(avc)) | 191 | if (unlikely(avc)) |
161 | anon_vma_chain_free(avc); | 192 | anon_vma_chain_free(avc); |
162 | } | 193 | } |
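Taken together, the anon_vma_prepare() hunks move the root and refcount initialisation into anon_vma_alloc() and switch the cleanup for a lost allocation race from anon_vma_free() to put_anon_vma(). A condensed sketch of the resulting flow (abridged; the locked re-check of vma->anon_vma and the error labels are elided):

	avc = anon_vma_chain_alloc(GFP_KERNEL);
	anon_vma = find_mergeable_anon_vma(vma);
	allocated = NULL;
	if (!anon_vma) {
		anon_vma = anon_vma_alloc();	/* refcount = 1, root = itself */
		allocated = anon_vma;
	}

	anon_vma_lock(anon_vma);
	/* ... install vma->anon_vma and link avc unless another thread won the race ... */
	anon_vma_unlock(anon_vma);

	if (unlikely(allocated))
		put_anon_vma(allocated);	/* lost the race: drop the initial reference */
	if (unlikely(avc))
		anon_vma_chain_free(avc);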
@@ -168,6 +199,32 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
168 | return -ENOMEM; | 199 | return -ENOMEM; |
169 | } | 200 | } |
170 | 201 | ||
202 | /* | ||
203 | * This is a useful helper function for locking the anon_vma root as | ||
204 | * we traverse the vma->anon_vma_chain, looping over anon_vma's that | ||
205 | * have the same vma. | ||
206 | * | ||
207 | * Such anon_vma's should have the same root, so you'd expect to see | ||
208 | * just a single mutex_lock for the whole traversal. | ||
209 | */ | ||
210 | static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) | ||
211 | { | ||
212 | struct anon_vma *new_root = anon_vma->root; | ||
213 | if (new_root != root) { | ||
214 | if (WARN_ON_ONCE(root)) | ||
215 | mutex_unlock(&root->mutex); | ||
216 | root = new_root; | ||
217 | mutex_lock(&root->mutex); | ||
218 | } | ||
219 | return root; | ||
220 | } | ||
221 | |||
222 | static inline void unlock_anon_vma_root(struct anon_vma *root) | ||
223 | { | ||
224 | if (root) | ||
225 | mutex_unlock(&root->mutex); | ||
226 | } | ||
227 | |||
171 | static void anon_vma_chain_link(struct vm_area_struct *vma, | 228 | static void anon_vma_chain_link(struct vm_area_struct *vma, |
172 | struct anon_vma_chain *avc, | 229 | struct anon_vma_chain *avc, |
173 | struct anon_vma *anon_vma) | 230 | struct anon_vma *anon_vma) |
@@ -176,9 +233,11 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
176 | avc->anon_vma = anon_vma; | 233 | avc->anon_vma = anon_vma; |
177 | list_add(&avc->same_vma, &vma->anon_vma_chain); | 234 | list_add(&avc->same_vma, &vma->anon_vma_chain); |
178 | 235 | ||
179 | anon_vma_lock(anon_vma); | 236 | /* |
237 | * It's critical to add new vmas to the tail of the anon_vma, | ||
238 | * see comment in huge_memory.c:__split_huge_page(). | ||
239 | */ | ||
180 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | 240 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); |
181 | anon_vma_unlock(anon_vma); | ||
182 | } | 241 | } |
183 | 242 | ||
184 | /* | 243 | /* |
@@ -188,13 +247,24 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
188 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | 247 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) |
189 | { | 248 | { |
190 | struct anon_vma_chain *avc, *pavc; | 249 | struct anon_vma_chain *avc, *pavc; |
250 | struct anon_vma *root = NULL; | ||
191 | 251 | ||
192 | list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { | 252 | list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { |
193 | avc = anon_vma_chain_alloc(); | 253 | struct anon_vma *anon_vma; |
194 | if (!avc) | 254 | |
195 | goto enomem_failure; | 255 | avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); |
196 | anon_vma_chain_link(dst, avc, pavc->anon_vma); | 256 | if (unlikely(!avc)) { |
257 | unlock_anon_vma_root(root); | ||
258 | root = NULL; | ||
259 | avc = anon_vma_chain_alloc(GFP_KERNEL); | ||
260 | if (!avc) | ||
261 | goto enomem_failure; | ||
262 | } | ||
263 | anon_vma = pavc->anon_vma; | ||
264 | root = lock_anon_vma_root(root, anon_vma); | ||
265 | anon_vma_chain_link(dst, avc, anon_vma); | ||
197 | } | 266 | } |
267 | unlock_anon_vma_root(root); | ||
198 | return 0; | 268 | return 0; |
199 | 269 | ||
200 | enomem_failure: | 270 | enomem_failure: |
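For context, the enomem_failure label this hunk ends on is unchanged by the commit; in mainline it simply tears down whatever was linked into dst before the allocation failed:

	 enomem_failure:
		unlink_anon_vmas(dst);
		return -ENOMEM;

which is one reason the rewritten unlink_anon_vmas() below has to cope with a partially built chain.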
@@ -227,7 +297,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
227 | anon_vma = anon_vma_alloc(); | 297 | anon_vma = anon_vma_alloc(); |
228 | if (!anon_vma) | 298 | if (!anon_vma) |
229 | goto out_error; | 299 | goto out_error; |
230 | avc = anon_vma_chain_alloc(); | 300 | avc = anon_vma_chain_alloc(GFP_KERNEL); |
231 | if (!avc) | 301 | if (!avc) |
232 | goto out_error_free_anon_vma; | 302 | goto out_error_free_anon_vma; |
233 | 303 | ||
@@ -237,58 +307,63 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
237 | */ | 307 | */ |
238 | anon_vma->root = pvma->anon_vma->root; | 308 | anon_vma->root = pvma->anon_vma->root; |
239 | /* | 309 | /* |
240 | * With KSM refcounts, an anon_vma can stay around longer than the | 310 | * With refcounts, an anon_vma can stay around longer than the |
241 | * process it belongs to. The root anon_vma needs to be pinned | 311 | * process it belongs to. The root anon_vma needs to be pinned until |
242 | * until this anon_vma is freed, because the lock lives in the root. | 312 | * this anon_vma is freed, because the lock lives in the root. |
243 | */ | 313 | */ |
244 | get_anon_vma(anon_vma->root); | 314 | get_anon_vma(anon_vma->root); |
245 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | 315 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ |
246 | vma->anon_vma = anon_vma; | 316 | vma->anon_vma = anon_vma; |
317 | anon_vma_lock(anon_vma); | ||
247 | anon_vma_chain_link(vma, avc, anon_vma); | 318 | anon_vma_chain_link(vma, avc, anon_vma); |
319 | anon_vma_unlock(anon_vma); | ||
248 | 320 | ||
249 | return 0; | 321 | return 0; |
250 | 322 | ||
251 | out_error_free_anon_vma: | 323 | out_error_free_anon_vma: |
252 | anon_vma_free(anon_vma); | 324 | put_anon_vma(anon_vma); |
253 | out_error: | 325 | out_error: |
254 | unlink_anon_vmas(vma); | 326 | unlink_anon_vmas(vma); |
255 | return -ENOMEM; | 327 | return -ENOMEM; |
256 | } | 328 | } |
257 | 329 | ||
258 | static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) | 330 | void unlink_anon_vmas(struct vm_area_struct *vma) |
259 | { | 331 | { |
260 | struct anon_vma *anon_vma = anon_vma_chain->anon_vma; | 332 | struct anon_vma_chain *avc, *next; |
261 | int empty; | 333 | struct anon_vma *root = NULL; |
262 | 334 | ||
263 | /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ | 335 | /* |
264 | if (!anon_vma) | 336 | * Unlink each anon_vma chained to the VMA. This list is ordered |
265 | return; | 337 | * from newest to oldest, ensuring the root anon_vma gets freed last. |
338 | */ | ||
339 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | ||
340 | struct anon_vma *anon_vma = avc->anon_vma; | ||
266 | 341 | ||
267 | anon_vma_lock(anon_vma); | 342 | root = lock_anon_vma_root(root, anon_vma); |
268 | list_del(&anon_vma_chain->same_anon_vma); | 343 | list_del(&avc->same_anon_vma); |
269 | 344 | ||
270 | /* We must garbage collect the anon_vma if it's empty */ | 345 | /* |
271 | empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); | 346 | * Leave empty anon_vmas on the list - we'll need |
272 | anon_vma_unlock(anon_vma); | 347 | * to free them outside the lock. |
348 | */ | ||
349 | if (list_empty(&anon_vma->head)) | ||
350 | continue; | ||
273 | 351 | ||
274 | if (empty) { | 352 | list_del(&avc->same_vma); |
275 | /* We no longer need the root anon_vma */ | 353 | anon_vma_chain_free(avc); |
276 | if (anon_vma->root != anon_vma) | ||
277 | drop_anon_vma(anon_vma->root); | ||
278 | anon_vma_free(anon_vma); | ||
279 | } | 354 | } |
280 | } | 355 | unlock_anon_vma_root(root); |
281 | |||
282 | void unlink_anon_vmas(struct vm_area_struct *vma) | ||
283 | { | ||
284 | struct anon_vma_chain *avc, *next; | ||
285 | 356 | ||
286 | /* | 357 | /* |
287 | * Unlink each anon_vma chained to the VMA. This list is ordered | 358 | * Iterate the list once more, it now only contains empty and unlinked |
288 | * from newest to oldest, ensuring the root anon_vma gets freed last. | 359 | * anon_vmas, destroy them. Could not do before due to __put_anon_vma() |
360 | * needing to acquire the anon_vma->root->mutex. | ||
289 | */ | 361 | */ |
290 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | 362 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { |
291 | anon_vma_unlink(avc); | 363 | struct anon_vma *anon_vma = avc->anon_vma; |
364 | |||
365 | put_anon_vma(anon_vma); | ||
366 | |||
292 | list_del(&avc->same_vma); | 367 | list_del(&avc->same_vma); |
293 | anon_vma_chain_free(avc); | 368 | anon_vma_chain_free(avc); |
294 | } | 369 | } |
@@ -298,8 +373,8 @@ static void anon_vma_ctor(void *data) | |||
298 | { | 373 | { |
299 | struct anon_vma *anon_vma = data; | 374 | struct anon_vma *anon_vma = data; |
300 | 375 | ||
301 | spin_lock_init(&anon_vma->lock); | 376 | mutex_init(&anon_vma->mutex); |
302 | anonvma_external_refcount_init(anon_vma); | 377 | atomic_set(&anon_vma->refcount, 0); |
303 | INIT_LIST_HEAD(&anon_vma->head); | 378 | INIT_LIST_HEAD(&anon_vma->head); |
304 | } | 379 | } |
305 | 380 | ||
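The constructor hunk mirrors the new layout of struct anon_vma. Assuming the include/linux/rmap.h of the same series, the structure now looks roughly like this (field comments paraphrased):

	struct anon_vma {
		struct anon_vma *root;		/* root of this anon_vma tree */
		struct mutex mutex;		/* serialises access to the vma list */
		atomic_t refcount;		/* pins the anon_vma for lockless walkers */
		struct list_head head;		/* chain of private "related" vmas */
	};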
@@ -311,12 +386,31 @@ void __init anon_vma_init(void) | |||
311 | } | 386 | } |
312 | 387 | ||
313 | /* | 388 | /* |
314 | * Getting a lock on a stable anon_vma from a page off the LRU is | 389 | * Getting a lock on a stable anon_vma from a page off the LRU is tricky! |
315 | * tricky: page_lock_anon_vma rely on RCU to guard against the races. | 390 | * |
391 | * Since there is no serialization what so ever against page_remove_rmap() | ||
392 | * the best this function can do is return a locked anon_vma that might | ||
393 | * have been relevant to this page. | ||
394 | * | ||
395 | * The page might have been remapped to a different anon_vma or the anon_vma | ||
396 | * returned may already be freed (and even reused). | ||
397 | * | ||
398 | * In case it was remapped to a different anon_vma, the new anon_vma will be a | ||
399 | * child of the old anon_vma, and the anon_vma lifetime rules will therefore | ||
400 | * ensure that any anon_vma obtained from the page will still be valid for as | ||
401 | * long as we observe page_mapped() [ hence all those page_mapped() tests ]. | ||
402 | * | ||
403 | * All users of this function must be very careful when walking the anon_vma | ||
404 | * chain and verify that the page in question is indeed mapped in it | ||
405 | * [ something equivalent to page_mapped_in_vma() ]. | ||
406 | * | ||
407 | * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap() | ||
408 | * that the anon_vma pointer from page->mapping is valid if there is a | ||
409 | * mapcount, we can dereference the anon_vma after observing those. | ||
316 | */ | 410 | */ |
317 | struct anon_vma *page_lock_anon_vma(struct page *page) | 411 | struct anon_vma *page_get_anon_vma(struct page *page) |
318 | { | 412 | { |
319 | struct anon_vma *anon_vma, *root_anon_vma; | 413 | struct anon_vma *anon_vma = NULL; |
320 | unsigned long anon_mapping; | 414 | unsigned long anon_mapping; |
321 | 415 | ||
322 | rcu_read_lock(); | 416 | rcu_read_lock(); |
@@ -327,30 +421,100 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
327 | goto out; | 421 | goto out; |
328 | 422 | ||
329 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | 423 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); |
330 | root_anon_vma = ACCESS_ONCE(anon_vma->root); | 424 | if (!atomic_inc_not_zero(&anon_vma->refcount)) { |
331 | spin_lock(&root_anon_vma->lock); | 425 | anon_vma = NULL; |
426 | goto out; | ||
427 | } | ||
332 | 428 | ||
333 | /* | 429 | /* |
334 | * If this page is still mapped, then its anon_vma cannot have been | 430 | * If this page is still mapped, then its anon_vma cannot have been |
335 | * freed. But if it has been unmapped, we have no security against | 431 | * freed. But if it has been unmapped, we have no security against the |
336 | * the anon_vma structure being freed and reused (for another anon_vma: | 432 | * anon_vma structure being freed and reused (for another anon_vma: |
337 | * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot | 433 | * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() |
338 | * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting | 434 | * above cannot corrupt). |
339 | * anon_vma->root before page_unlock_anon_vma() is called to unlock. | ||
340 | */ | 435 | */ |
341 | if (page_mapped(page)) | 436 | if (!page_mapped(page)) { |
342 | return anon_vma; | 437 | put_anon_vma(anon_vma); |
438 | anon_vma = NULL; | ||
439 | } | ||
440 | out: | ||
441 | rcu_read_unlock(); | ||
442 | |||
443 | return anon_vma; | ||
444 | } | ||
445 | |||
446 | /* | ||
447 | * Similar to page_get_anon_vma() except it locks the anon_vma. | ||
448 | * | ||
449 | * Its a little more complex as it tries to keep the fast path to a single | ||
450 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a | ||
451 | * reference like with page_get_anon_vma() and then block on the mutex. | ||
452 | */ | ||
453 | struct anon_vma *page_lock_anon_vma(struct page *page) | ||
454 | { | ||
455 | struct anon_vma *anon_vma = NULL; | ||
456 | struct anon_vma *root_anon_vma; | ||
457 | unsigned long anon_mapping; | ||
458 | |||
459 | rcu_read_lock(); | ||
460 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); | ||
461 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) | ||
462 | goto out; | ||
463 | if (!page_mapped(page)) | ||
464 | goto out; | ||
465 | |||
466 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | ||
467 | root_anon_vma = ACCESS_ONCE(anon_vma->root); | ||
468 | if (mutex_trylock(&root_anon_vma->mutex)) { | ||
469 | /* | ||
470 | * If the page is still mapped, then this anon_vma is still | ||
471 | * its anon_vma, and holding the mutex ensures that it will | ||
472 | * not go away, see anon_vma_free(). | ||
473 | */ | ||
474 | if (!page_mapped(page)) { | ||
475 | mutex_unlock(&root_anon_vma->mutex); | ||
476 | anon_vma = NULL; | ||
477 | } | ||
478 | goto out; | ||
479 | } | ||
480 | |||
481 | /* trylock failed, we got to sleep */ | ||
482 | if (!atomic_inc_not_zero(&anon_vma->refcount)) { | ||
483 | anon_vma = NULL; | ||
484 | goto out; | ||
485 | } | ||
486 | |||
487 | if (!page_mapped(page)) { | ||
488 | put_anon_vma(anon_vma); | ||
489 | anon_vma = NULL; | ||
490 | goto out; | ||
491 | } | ||
492 | |||
493 | /* we pinned the anon_vma, its safe to sleep */ | ||
494 | rcu_read_unlock(); | ||
495 | anon_vma_lock(anon_vma); | ||
496 | |||
497 | if (atomic_dec_and_test(&anon_vma->refcount)) { | ||
498 | /* | ||
499 | * Oops, we held the last refcount, release the lock | ||
500 | * and bail -- can't simply use put_anon_vma() because | ||
501 | * we'll deadlock on the anon_vma_lock() recursion. | ||
502 | */ | ||
503 | anon_vma_unlock(anon_vma); | ||
504 | __put_anon_vma(anon_vma); | ||
505 | anon_vma = NULL; | ||
506 | } | ||
507 | |||
508 | return anon_vma; | ||
343 | 509 | ||
344 | spin_unlock(&root_anon_vma->lock); | ||
345 | out: | 510 | out: |
346 | rcu_read_unlock(); | 511 | rcu_read_unlock(); |
347 | return NULL; | 512 | return anon_vma; |
348 | } | 513 | } |
349 | 514 | ||
350 | void page_unlock_anon_vma(struct anon_vma *anon_vma) | 515 | void page_unlock_anon_vma(struct anon_vma *anon_vma) |
351 | { | 516 | { |
352 | anon_vma_unlock(anon_vma); | 517 | anon_vma_unlock(anon_vma); |
353 | rcu_read_unlock(); | ||
354 | } | 518 | } |
355 | 519 | ||
356 | /* | 520 | /* |
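A condensed caller pattern for the new helper pair, modelled on try_to_unmap_anon() elsewhere in this file (the function and variable names below are illustrative only):

	static int walk_page_anon_vma(struct page *page)
	{
		struct anon_vma *anon_vma;
		struct anon_vma_chain *avc;
		int ret = SWAP_AGAIN;

		anon_vma = page_lock_anon_vma(page);	/* NULL if unmapped or racing */
		if (!anon_vma)
			return ret;

		list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
			struct vm_area_struct *vma = avc->vma;
			/* ... per-vma work, e.g. try_to_unmap_one(page, vma, ...) ... */
		}

		page_unlock_anon_vma(anon_vma);
		return ret;
	}

page_get_anon_vma() is the reference-only variant for callers, such as migration, that need the anon_vma to stay allocated but do not need its mutex.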
@@ -358,7 +522,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) | |||
358 | * Returns virtual address or -EFAULT if page's index/offset is not | 522 | * Returns virtual address or -EFAULT if page's index/offset is not |
359 | * within the range mapped the @vma. | 523 | * within the range mapped the @vma. |
360 | */ | 524 | */ |
361 | static inline unsigned long | 525 | inline unsigned long |
362 | vma_address(struct page *page, struct vm_area_struct *vma) | 526 | vma_address(struct page *page, struct vm_area_struct *vma) |
363 | { | 527 | { |
364 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 528 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
@@ -407,7 +571,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
407 | * | 571 | * |
408 | * On success returns with pte mapped and locked. | 572 | * On success returns with pte mapped and locked. |
409 | */ | 573 | */ |
410 | pte_t *page_check_address(struct page *page, struct mm_struct *mm, | 574 | pte_t *__page_check_address(struct page *page, struct mm_struct *mm, |
411 | unsigned long address, spinlock_t **ptlp, int sync) | 575 | unsigned long address, spinlock_t **ptlp, int sync) |
412 | { | 576 | { |
413 | pgd_t *pgd; | 577 | pgd_t *pgd; |
@@ -433,6 +597,8 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, | |||
433 | pmd = pmd_offset(pud, address); | 597 | pmd = pmd_offset(pud, address); |
434 | if (!pmd_present(*pmd)) | 598 | if (!pmd_present(*pmd)) |
435 | return NULL; | 599 | return NULL; |
600 | if (pmd_trans_huge(*pmd)) | ||
601 | return NULL; | ||
436 | 602 | ||
437 | pte = pte_offset_map(pmd, address); | 603 | pte = pte_offset_map(pmd, address); |
438 | /* Make a quick check before getting the lock */ | 604 | /* Make a quick check before getting the lock */ |
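The rename to __page_check_address() implies the old name becomes a thin wrapper in include/linux/rmap.h so existing callers compile unchanged. A simplified sketch of that assumed wrapper (the mainline inline also carries a sparse __cond_lock() annotation):

	static inline pte_t *page_check_address(struct page *page, struct mm_struct *mm,
						unsigned long address,
						spinlock_t **ptlp, int sync)
	{
		return __page_check_address(page, mm, address, ptlp, sync);
	}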
@@ -487,35 +653,65 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
487 | unsigned long *vm_flags) | 653 | unsigned long *vm_flags) |
488 | { | 654 | { |
489 | struct mm_struct *mm = vma->vm_mm; | 655 | struct mm_struct *mm = vma->vm_mm; |
490 | pte_t *pte; | ||
491 | spinlock_t *ptl; | ||
492 | int referenced = 0; | 656 | int referenced = 0; |
493 | 657 | ||
494 | pte = page_check_address(page, mm, address, &ptl, 0); | 658 | if (unlikely(PageTransHuge(page))) { |
495 | if (!pte) | 659 | pmd_t *pmd; |
496 | goto out; | ||
497 | |||
498 | /* | ||
499 | * Don't want to elevate referenced for mlocked page that gets this far, | ||
500 | * in order that it progresses to try_to_unmap and is moved to the | ||
501 | * unevictable list. | ||
502 | */ | ||
503 | if (vma->vm_flags & VM_LOCKED) { | ||
504 | *mapcount = 1; /* break early from loop */ | ||
505 | *vm_flags |= VM_LOCKED; | ||
506 | goto out_unmap; | ||
507 | } | ||
508 | 660 | ||
509 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | 661 | spin_lock(&mm->page_table_lock); |
510 | /* | 662 | /* |
511 | * Don't treat a reference through a sequentially read | 663 | * rmap might return false positives; we must filter |
512 | * mapping as such. If the page has been used in | 664 | * these out using page_check_address_pmd(). |
513 | * another mapping, we will catch it; if this other | ||
514 | * mapping is already gone, the unmap path will have | ||
515 | * set PG_referenced or activated the page. | ||
516 | */ | 665 | */ |
517 | if (likely(!VM_SequentialReadHint(vma))) | 666 | pmd = page_check_address_pmd(page, mm, address, |
667 | PAGE_CHECK_ADDRESS_PMD_FLAG); | ||
668 | if (!pmd) { | ||
669 | spin_unlock(&mm->page_table_lock); | ||
670 | goto out; | ||
671 | } | ||
672 | |||
673 | if (vma->vm_flags & VM_LOCKED) { | ||
674 | spin_unlock(&mm->page_table_lock); | ||
675 | *mapcount = 0; /* break early from loop */ | ||
676 | *vm_flags |= VM_LOCKED; | ||
677 | goto out; | ||
678 | } | ||
679 | |||
680 | /* go ahead even if the pmd is pmd_trans_splitting() */ | ||
681 | if (pmdp_clear_flush_young_notify(vma, address, pmd)) | ||
518 | referenced++; | 682 | referenced++; |
683 | spin_unlock(&mm->page_table_lock); | ||
684 | } else { | ||
685 | pte_t *pte; | ||
686 | spinlock_t *ptl; | ||
687 | |||
688 | /* | ||
689 | * rmap might return false positives; we must filter | ||
690 | * these out using page_check_address(). | ||
691 | */ | ||
692 | pte = page_check_address(page, mm, address, &ptl, 0); | ||
693 | if (!pte) | ||
694 | goto out; | ||
695 | |||
696 | if (vma->vm_flags & VM_LOCKED) { | ||
697 | pte_unmap_unlock(pte, ptl); | ||
698 | *mapcount = 0; /* break early from loop */ | ||
699 | *vm_flags |= VM_LOCKED; | ||
700 | goto out; | ||
701 | } | ||
702 | |||
703 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | ||
704 | /* | ||
705 | * Don't treat a reference through a sequentially read | ||
706 | * mapping as such. If the page has been used in | ||
707 | * another mapping, we will catch it; if this other | ||
708 | * mapping is already gone, the unmap path will have | ||
709 | * set PG_referenced or activated the page. | ||
710 | */ | ||
711 | if (likely(!VM_SequentialReadHint(vma))) | ||
712 | referenced++; | ||
713 | } | ||
714 | pte_unmap_unlock(pte, ptl); | ||
519 | } | 715 | } |
520 | 716 | ||
521 | /* Pretend the page is referenced if the task has the | 717 | /* Pretend the page is referenced if the task has the |
@@ -524,9 +720,7 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
524 | rwsem_is_locked(&mm->mmap_sem)) | 720 | rwsem_is_locked(&mm->mmap_sem)) |
525 | referenced++; | 721 | referenced++; |
526 | 722 | ||
527 | out_unmap: | ||
528 | (*mapcount)--; | 723 | (*mapcount)--; |
529 | pte_unmap_unlock(pte, ptl); | ||
530 | 724 | ||
531 | if (referenced) | 725 | if (referenced) |
532 | *vm_flags |= vma->vm_flags; | 726 | *vm_flags |= vma->vm_flags; |
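The PageTransHuge() branch added to page_referenced_one() compiles away on !CONFIG_TRANSPARENT_HUGEPAGE builds; in the page-flags.h of this era the helper is roughly:

	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	static inline int PageTransHuge(struct page *page)
	{
		VM_BUG_ON(PageTail(page));
		return PageHead(page);
	}
	#else
	#define PageTransHuge(page) 0
	#endif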
@@ -605,14 +799,14 @@ static int page_referenced_file(struct page *page, | |||
605 | * The page lock not only makes sure that page->mapping cannot | 799 | * The page lock not only makes sure that page->mapping cannot |
606 | * suddenly be NULLified by truncation, it makes sure that the | 800 | * suddenly be NULLified by truncation, it makes sure that the |
607 | * structure at mapping cannot be freed and reused yet, | 801 | * structure at mapping cannot be freed and reused yet, |
608 | * so we can safely take mapping->i_mmap_lock. | 802 | * so we can safely take mapping->i_mmap_mutex. |
609 | */ | 803 | */ |
610 | BUG_ON(!PageLocked(page)); | 804 | BUG_ON(!PageLocked(page)); |
611 | 805 | ||
612 | spin_lock(&mapping->i_mmap_lock); | 806 | mutex_lock(&mapping->i_mmap_mutex); |
613 | 807 | ||
614 | /* | 808 | /* |
615 | * i_mmap_lock does not stabilize mapcount at all, but mapcount | 809 | * i_mmap_mutex does not stabilize mapcount at all, but mapcount |
616 | * is more likely to be accurate if we note it after spinning. | 810 | * is more likely to be accurate if we note it after spinning. |
617 | */ | 811 | */ |
618 | mapcount = page_mapcount(page); | 812 | mapcount = page_mapcount(page); |
@@ -634,7 +828,7 @@ static int page_referenced_file(struct page *page, | |||
634 | break; | 828 | break; |
635 | } | 829 | } |
636 | 830 | ||
637 | spin_unlock(&mapping->i_mmap_lock); | 831 | mutex_unlock(&mapping->i_mmap_mutex); |
638 | return referenced; | 832 | return referenced; |
639 | } | 833 | } |
640 | 834 | ||
@@ -678,7 +872,7 @@ int page_referenced(struct page *page, | |||
678 | unlock_page(page); | 872 | unlock_page(page); |
679 | } | 873 | } |
680 | out: | 874 | out: |
681 | if (page_test_and_clear_young(page)) | 875 | if (page_test_and_clear_young(page_to_pfn(page))) |
682 | referenced++; | 876 | referenced++; |
683 | 877 | ||
684 | return referenced; | 878 | return referenced; |
@@ -721,7 +915,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) | |||
721 | 915 | ||
722 | BUG_ON(PageAnon(page)); | 916 | BUG_ON(PageAnon(page)); |
723 | 917 | ||
724 | spin_lock(&mapping->i_mmap_lock); | 918 | mutex_lock(&mapping->i_mmap_mutex); |
725 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 919 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
726 | if (vma->vm_flags & VM_SHARED) { | 920 | if (vma->vm_flags & VM_SHARED) { |
727 | unsigned long address = vma_address(page, vma); | 921 | unsigned long address = vma_address(page, vma); |
@@ -730,7 +924,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) | |||
730 | ret += page_mkclean_one(page, vma, address); | 924 | ret += page_mkclean_one(page, vma, address); |
731 | } | 925 | } |
732 | } | 926 | } |
733 | spin_unlock(&mapping->i_mmap_lock); | 927 | mutex_unlock(&mapping->i_mmap_mutex); |
734 | return ret; | 928 | return ret; |
735 | } | 929 | } |
736 | 930 | ||
@@ -744,10 +938,8 @@ int page_mkclean(struct page *page) | |||
744 | struct address_space *mapping = page_mapping(page); | 938 | struct address_space *mapping = page_mapping(page); |
745 | if (mapping) { | 939 | if (mapping) { |
746 | ret = page_mkclean_file(mapping, page); | 940 | ret = page_mkclean_file(mapping, page); |
747 | if (page_test_dirty(page)) { | 941 | if (page_test_and_clear_dirty(page_to_pfn(page), 1)) |
748 | page_clear_dirty(page); | ||
749 | ret = 1; | 942 | ret = 1; |
750 | } | ||
751 | } | 943 | } |
752 | } | 944 | } |
753 | 945 | ||
@@ -780,10 +972,10 @@ void page_move_anon_rmap(struct page *page, | |||
780 | } | 972 | } |
781 | 973 | ||
782 | /** | 974 | /** |
783 | * __page_set_anon_rmap - setup new anonymous rmap | 975 | * __page_set_anon_rmap - set up new anonymous rmap |
784 | * @page: the page to add the mapping to | 976 | * @page: Page to add to rmap |
785 | * @vma: the vm area in which the mapping is added | 977 | * @vma: VM area to add page to. |
786 | * @address: the user virtual address mapped | 978 | * @address: User virtual address of the mapping |
787 | * @exclusive: the page is exclusively owned by the current process | 979 | * @exclusive: the page is exclusively owned by the current process |
788 | */ | 980 | */ |
789 | static void __page_set_anon_rmap(struct page *page, | 981 | static void __page_set_anon_rmap(struct page *page, |
@@ -793,25 +985,16 @@ static void __page_set_anon_rmap(struct page *page, | |||
793 | 985 | ||
794 | BUG_ON(!anon_vma); | 986 | BUG_ON(!anon_vma); |
795 | 987 | ||
988 | if (PageAnon(page)) | ||
989 | return; | ||
990 | |||
796 | /* | 991 | /* |
797 | * If the page isn't exclusively mapped into this vma, | 992 | * If the page isn't exclusively mapped into this vma, |
798 | * we must use the _oldest_ possible anon_vma for the | 993 | * we must use the _oldest_ possible anon_vma for the |
799 | * page mapping! | 994 | * page mapping! |
800 | */ | 995 | */ |
801 | if (!exclusive) { | 996 | if (!exclusive) |
802 | if (PageAnon(page)) | ||
803 | return; | ||
804 | anon_vma = anon_vma->root; | 997 | anon_vma = anon_vma->root; |
805 | } else { | ||
806 | /* | ||
807 | * In this case, swapped-out-but-not-discarded swap-cache | ||
808 | * is remapped. So, no need to update page->mapping here. | ||
809 | * We convice anon_vma poitned by page->mapping is not obsolete | ||
810 | * because vma->anon_vma is necessary to be a family of it. | ||
811 | */ | ||
812 | if (PageAnon(page)) | ||
813 | return; | ||
814 | } | ||
815 | 998 | ||
816 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 999 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
817 | page->mapping = (struct address_space *) anon_vma; | 1000 | page->mapping = (struct address_space *) anon_vma; |
@@ -871,13 +1054,18 @@ void do_page_add_anon_rmap(struct page *page, | |||
871 | struct vm_area_struct *vma, unsigned long address, int exclusive) | 1054 | struct vm_area_struct *vma, unsigned long address, int exclusive) |
872 | { | 1055 | { |
873 | int first = atomic_inc_and_test(&page->_mapcount); | 1056 | int first = atomic_inc_and_test(&page->_mapcount); |
874 | if (first) | 1057 | if (first) { |
875 | __inc_zone_page_state(page, NR_ANON_PAGES); | 1058 | if (!PageTransHuge(page)) |
1059 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
1060 | else | ||
1061 | __inc_zone_page_state(page, | ||
1062 | NR_ANON_TRANSPARENT_HUGEPAGES); | ||
1063 | } | ||
876 | if (unlikely(PageKsm(page))) | 1064 | if (unlikely(PageKsm(page))) |
877 | return; | 1065 | return; |
878 | 1066 | ||
879 | VM_BUG_ON(!PageLocked(page)); | 1067 | VM_BUG_ON(!PageLocked(page)); |
880 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 1068 | /* address might be in next vma when migration races vma_adjust */ |
881 | if (first) | 1069 | if (first) |
882 | __page_set_anon_rmap(page, vma, address, exclusive); | 1070 | __page_set_anon_rmap(page, vma, address, exclusive); |
883 | else | 1071 | else |
@@ -900,7 +1088,10 @@ void page_add_new_anon_rmap(struct page *page, | |||
900 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 1088 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
901 | SetPageSwapBacked(page); | 1089 | SetPageSwapBacked(page); |
902 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 1090 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
903 | __inc_zone_page_state(page, NR_ANON_PAGES); | 1091 | if (!PageTransHuge(page)) |
1092 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
1093 | else | ||
1094 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | ||
904 | __page_set_anon_rmap(page, vma, address, 1); | 1095 | __page_set_anon_rmap(page, vma, address, 1); |
905 | if (page_evictable(page, vma)) | 1096 | if (page_evictable(page, vma)) |
906 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 1097 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
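For reference, the exclusive-flag plumbing exercised by these hunks is reached through the unchanged wrapper just above them, which in mainline kernels of this era appears to be simply:

	void page_add_anon_rmap(struct page *page,
		struct vm_area_struct *vma, unsigned long address)
	{
		do_page_add_anon_rmap(page, vma, address, 0);
	}

so the NR_ANON_PAGES / NR_ANON_TRANSPARENT_HUGEPAGES split applies to both the ordinary and the new-page mapping paths.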
@@ -918,7 +1109,7 @@ void page_add_file_rmap(struct page *page) | |||
918 | { | 1109 | { |
919 | if (atomic_inc_and_test(&page->_mapcount)) { | 1110 | if (atomic_inc_and_test(&page->_mapcount)) { |
920 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 1111 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
921 | mem_cgroup_update_file_mapped(page, 1); | 1112 | mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); |
922 | } | 1113 | } |
923 | } | 1114 | } |
924 | 1115 | ||
@@ -941,10 +1132,9 @@ void page_remove_rmap(struct page *page) | |||
941 | * not if it's in swapcache - there might be another pte slot | 1132 | * not if it's in swapcache - there might be another pte slot |
942 | * containing the swap entry, but page not yet written to swap. | 1133 | * containing the swap entry, but page not yet written to swap. |
943 | */ | 1134 | */ |
944 | if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { | 1135 | if ((!PageAnon(page) || PageSwapCache(page)) && |
945 | page_clear_dirty(page); | 1136 | page_test_and_clear_dirty(page_to_pfn(page), 1)) |
946 | set_page_dirty(page); | 1137 | set_page_dirty(page); |
947 | } | ||
948 | /* | 1138 | /* |
949 | * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED | 1139 | * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED |
950 | * and not charged by memcg for now. | 1140 | * and not charged by memcg for now. |
@@ -953,10 +1143,14 @@ void page_remove_rmap(struct page *page) | |||
953 | return; | 1143 | return; |
954 | if (PageAnon(page)) { | 1144 | if (PageAnon(page)) { |
955 | mem_cgroup_uncharge_page(page); | 1145 | mem_cgroup_uncharge_page(page); |
956 | __dec_zone_page_state(page, NR_ANON_PAGES); | 1146 | if (!PageTransHuge(page)) |
1147 | __dec_zone_page_state(page, NR_ANON_PAGES); | ||
1148 | else | ||
1149 | __dec_zone_page_state(page, | ||
1150 | NR_ANON_TRANSPARENT_HUGEPAGES); | ||
957 | } else { | 1151 | } else { |
958 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 1152 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
959 | mem_cgroup_update_file_mapped(page, -1); | 1153 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); |
960 | } | 1154 | } |
961 | /* | 1155 | /* |
962 | * It would be tidy to reset the PageAnon mapping here, | 1156 | * It would be tidy to reset the PageAnon mapping here, |
@@ -1078,7 +1272,7 @@ out_mlock: | |||
1078 | /* | 1272 | /* |
1079 | * We need mmap_sem locking, Otherwise VM_LOCKED check makes | 1273 | * We need mmap_sem locking, Otherwise VM_LOCKED check makes |
1080 | * unstable result and race. Plus, We can't wait here because | 1274 | * unstable result and race. Plus, We can't wait here because |
1081 | * we now hold anon_vma->lock or mapping->i_mmap_lock. | 1275 | * we now hold anon_vma->mutex or mapping->i_mmap_mutex. |
1082 | * if trylock failed, the page remain in evictable lru and later | 1276 | * if trylock failed, the page remain in evictable lru and later |
1083 | * vmscan could retry to move the page to unevictable lru if the | 1277 | * vmscan could retry to move the page to unevictable lru if the |
1084 | * page is actually mlocked. | 1278 | * page is actually mlocked. |
@@ -1209,7 +1403,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1209 | return ret; | 1403 | return ret; |
1210 | } | 1404 | } |
1211 | 1405 | ||
1212 | static bool is_vma_temporary_stack(struct vm_area_struct *vma) | 1406 | bool is_vma_temporary_stack(struct vm_area_struct *vma) |
1213 | { | 1407 | { |
1214 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | 1408 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); |
1215 | 1409 | ||
@@ -1304,7 +1498,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1304 | unsigned long max_nl_size = 0; | 1498 | unsigned long max_nl_size = 0; |
1305 | unsigned int mapcount; | 1499 | unsigned int mapcount; |
1306 | 1500 | ||
1307 | spin_lock(&mapping->i_mmap_lock); | 1501 | mutex_lock(&mapping->i_mmap_mutex); |
1308 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1502 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
1309 | unsigned long address = vma_address(page, vma); | 1503 | unsigned long address = vma_address(page, vma); |
1310 | if (address == -EFAULT) | 1504 | if (address == -EFAULT) |
@@ -1350,7 +1544,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1350 | mapcount = page_mapcount(page); | 1544 | mapcount = page_mapcount(page); |
1351 | if (!mapcount) | 1545 | if (!mapcount) |
1352 | goto out; | 1546 | goto out; |
1353 | cond_resched_lock(&mapping->i_mmap_lock); | 1547 | cond_resched(); |
1354 | 1548 | ||
1355 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; | 1549 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; |
1356 | if (max_nl_cursor == 0) | 1550 | if (max_nl_cursor == 0) |
@@ -1372,7 +1566,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1372 | } | 1566 | } |
1373 | vma->vm_private_data = (void *) max_nl_cursor; | 1567 | vma->vm_private_data = (void *) max_nl_cursor; |
1374 | } | 1568 | } |
1375 | cond_resched_lock(&mapping->i_mmap_lock); | 1569 | cond_resched(); |
1376 | max_nl_cursor += CLUSTER_SIZE; | 1570 | max_nl_cursor += CLUSTER_SIZE; |
1377 | } while (max_nl_cursor <= max_nl_size); | 1571 | } while (max_nl_cursor <= max_nl_size); |
1378 | 1572 | ||
@@ -1384,7 +1578,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1384 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | 1578 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) |
1385 | vma->vm_private_data = NULL; | 1579 | vma->vm_private_data = NULL; |
1386 | out: | 1580 | out: |
1387 | spin_unlock(&mapping->i_mmap_lock); | 1581 | mutex_unlock(&mapping->i_mmap_mutex); |
1388 | return ret; | 1582 | return ret; |
1389 | } | 1583 | } |
1390 | 1584 | ||
@@ -1407,6 +1601,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1407 | int ret; | 1601 | int ret; |
1408 | 1602 | ||
1409 | BUG_ON(!PageLocked(page)); | 1603 | BUG_ON(!PageLocked(page)); |
1604 | VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); | ||
1410 | 1605 | ||
1411 | if (unlikely(PageKsm(page))) | 1606 | if (unlikely(PageKsm(page))) |
1412 | ret = try_to_unmap_ksm(page, flags); | 1607 | ret = try_to_unmap_ksm(page, flags); |
@@ -1446,41 +1641,15 @@ int try_to_munlock(struct page *page) | |||
1446 | return try_to_unmap_file(page, TTU_MUNLOCK); | 1641 | return try_to_unmap_file(page, TTU_MUNLOCK); |
1447 | } | 1642 | } |
1448 | 1643 | ||
1449 | #if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) | 1644 | void __put_anon_vma(struct anon_vma *anon_vma) |
1450 | /* | ||
1451 | * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root | ||
1452 | * if necessary. Be careful to do all the tests under the lock. Once | ||
1453 | * we know we are the last user, nobody else can get a reference and we | ||
1454 | * can do the freeing without the lock. | ||
1455 | */ | ||
1456 | void drop_anon_vma(struct anon_vma *anon_vma) | ||
1457 | { | 1645 | { |
1458 | BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0); | 1646 | struct anon_vma *root = anon_vma->root; |
1459 | if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { | ||
1460 | struct anon_vma *root = anon_vma->root; | ||
1461 | int empty = list_empty(&anon_vma->head); | ||
1462 | int last_root_user = 0; | ||
1463 | int root_empty = 0; | ||
1464 | 1647 | ||
1465 | /* | 1648 | if (root != anon_vma && atomic_dec_and_test(&root->refcount)) |
1466 | * The refcount on a non-root anon_vma got dropped. Drop | 1649 | anon_vma_free(root); |
1467 | * the refcount on the root and check if we need to free it. | ||
1468 | */ | ||
1469 | if (empty && anon_vma != root) { | ||
1470 | BUG_ON(atomic_read(&root->external_refcount) <= 0); | ||
1471 | last_root_user = atomic_dec_and_test(&root->external_refcount); | ||
1472 | root_empty = list_empty(&root->head); | ||
1473 | } | ||
1474 | anon_vma_unlock(anon_vma); | ||
1475 | 1650 | ||
1476 | if (empty) { | 1651 | anon_vma_free(anon_vma); |
1477 | anon_vma_free(anon_vma); | ||
1478 | if (root_empty && last_root_user) | ||
1479 | anon_vma_free(root); | ||
1480 | } | ||
1481 | } | ||
1482 | } | 1652 | } |
1483 | #endif | ||
1484 | 1653 | ||
1485 | #ifdef CONFIG_MIGRATION | 1654 | #ifdef CONFIG_MIGRATION |
1486 | /* | 1655 | /* |
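With drop_anon_vma() folded into the generic refcount, its KSM and migration callers collapse to put_anon_vma(). A hedged sketch of the migration-side pattern (cf. mm/migrate.c of the same series; abridged and illustrative):

	struct anon_vma *anon_vma = NULL;

	if (PageAnon(page))
		anon_vma = page_get_anon_vma(page);	/* takes a reference, may return NULL */

	/* ... try_to_unmap(), move_to_new_page(), remove_migration_ptes() ... */

	if (anon_vma)
		put_anon_vma(anon_vma);			/* frees via __put_anon_vma() on the last reference */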
@@ -1528,7 +1697,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | |||
1528 | 1697 | ||
1529 | if (!mapping) | 1698 | if (!mapping) |
1530 | return ret; | 1699 | return ret; |
1531 | spin_lock(&mapping->i_mmap_lock); | 1700 | mutex_lock(&mapping->i_mmap_mutex); |
1532 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1701 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
1533 | unsigned long address = vma_address(page, vma); | 1702 | unsigned long address = vma_address(page, vma); |
1534 | if (address == -EFAULT) | 1703 | if (address == -EFAULT) |
@@ -1542,7 +1711,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | |||
1542 | * never contain migration ptes. Decide what to do about this | 1711 | * never contain migration ptes. Decide what to do about this |
1543 | * limitation to linear when we need rmap_walk() on nonlinear. | 1712 | * limitation to linear when we need rmap_walk() on nonlinear. |
1544 | */ | 1713 | */ |
1545 | spin_unlock(&mapping->i_mmap_lock); | 1714 | mutex_unlock(&mapping->i_mmap_mutex); |
1546 | return ret; | 1715 | return ret; |
1547 | } | 1716 | } |
1548 | 1717 | ||
@@ -1591,7 +1760,7 @@ void hugepage_add_anon_rmap(struct page *page, | |||
1591 | 1760 | ||
1592 | BUG_ON(!PageLocked(page)); | 1761 | BUG_ON(!PageLocked(page)); |
1593 | BUG_ON(!anon_vma); | 1762 | BUG_ON(!anon_vma); |
1594 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 1763 | /* address might be in next vma when migration races vma_adjust */ |
1595 | first = atomic_inc_and_test(&page->_mapcount); | 1764 | first = atomic_inc_and_test(&page->_mapcount); |
1596 | if (first) | 1765 | if (first) |
1597 | __hugepage_set_anon_rmap(page, vma, address, 0); | 1766 | __hugepage_set_anon_rmap(page, vma, address, 0); |