Diffstat (limited to 'mm/rmap.c')
-rw-r--r--	mm/rmap.c	172
1 files changed, 137 insertions, 35 deletions
@@ -24,8 +24,8 @@
  *   inode->i_alloc_sem (vmtruncate_range)
  *   mm->mmap_sem
  *     page->flags PG_locked (lock_page)
- *       mapping->i_mmap_lock
- *         anon_vma->lock
+ *       mapping->i_mmap_mutex
+ *         anon_vma->mutex
  *           mm->page_table_lock or pte_lock
  *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
  *             swap_lock (in swap_duplicate, swap_info_get)
@@ -40,7 +40,7 @@
  *
  * (code doesn't rely on that order so it could be switched around)
  * ->tasklist_lock
- *   anon_vma->lock (memory_failure, collect_procs_anon)
+ *   anon_vma->mutex (memory_failure, collect_procs_anon)
  *     pte map lock
  */
 
@@ -86,6 +86,29 @@ static inline struct anon_vma *anon_vma_alloc(void)
 static inline void anon_vma_free(struct anon_vma *anon_vma)
 {
         VM_BUG_ON(atomic_read(&anon_vma->refcount));
+
+        /*
+         * Synchronize against page_lock_anon_vma() such that
+         * we can safely hold the lock without the anon_vma getting
+         * freed.
+         *
+         * Relies on the full mb implied by the atomic_dec_and_test() from
+         * put_anon_vma() against the acquire barrier implied by
+         * mutex_trylock() from page_lock_anon_vma(). This orders:
+         *
+         * page_lock_anon_vma()          VS      put_anon_vma()
+         *   mutex_trylock()                       atomic_dec_and_test()
+         *   LOCK                                  MB
+         *   atomic_read()                         mutex_is_locked()
+         *
+         * LOCK should suffice since the actual taking of the lock must
+         * happen _before_ what follows.
+         */
+        if (mutex_is_locked(&anon_vma->root->mutex)) {
+                anon_vma_lock(anon_vma);
+                anon_vma_unlock(anon_vma);
+        }
+
         kmem_cache_free(anon_vma_cachep, anon_vma);
 }
 
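The handshake added to anon_vma_free() above pairs with the mutex_trylock() fast path in page_lock_anon_vma() further down. As a rough illustration only (not code from this patch), the same idea can be sketched in userspace C with C11 atomics and pthreads. The names struct obj, obj_new(), obj_put() and obj_trylock() are invented for this sketch, and it glosses over the fact that the kernel additionally relies on SLAB_DESTROY_BY_RCU plus RCU to keep the object's memory type-stable while a racing reader touches it:

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

/* Invented stand-in for anon_vma: a refcount plus the lock readers take. */
struct obj {
        atomic_int refcount;
        pthread_mutex_t lock;
};

static struct obj *obj_new(void)
{
        struct obj *o = malloc(sizeof(*o));
        if (!o)
                return NULL;
        atomic_init(&o->refcount, 1);
        pthread_mutex_init(&o->lock, NULL);
        return o;
}

/* Free side, analogous to anon_vma_free() above. */
static void obj_free(struct obj *o)
{
        /*
         * A reader may have won the trylock but not yet re-checked the
         * refcount; wait for it to drop the lock before tearing the object
         * down, so it never runs on freed memory while holding the lock.
         * (anon_vma_free() only bothers when mutex_is_locked() is true.)
         */
        pthread_mutex_lock(&o->lock);
        pthread_mutex_unlock(&o->lock);
        pthread_mutex_destroy(&o->lock);
        free(o);
}

/* Drop a reference, analogous to put_anon_vma(). */
static void obj_put(struct obj *o)
{
        /* the seq_cst RMW plays the role of the full barrier in the comment */
        if (atomic_fetch_sub(&o->refcount, 1) == 1)
                obj_free(o);
}

/* Reader fast path, analogous to the trylock branch of page_lock_anon_vma(). */
static struct obj *obj_trylock(struct obj *o)
{
        if (pthread_mutex_trylock(&o->lock) != 0)
                return NULL;            /* contended: fall back to a refcount */
        if (atomic_load(&o->refcount) == 0) {
                /* last reference already gone; the freer is waiting on the lock */
                pthread_mutex_unlock(&o->lock);
                return NULL;
        }
        return o;                       /* locked, and guaranteed to stay live */
}

If obj_trylock() observes a nonzero refcount while holding the lock, any concurrent final obj_put() is forced to block in obj_free() until the reader unlocks, which is exactly the ordering the kernel comment argues for.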
@@ -307,7 +330,7 @@ static void anon_vma_ctor(void *data)
 {
         struct anon_vma *anon_vma = data;
 
-        spin_lock_init(&anon_vma->lock);
+        mutex_init(&anon_vma->mutex);
         atomic_set(&anon_vma->refcount, 0);
         INIT_LIST_HEAD(&anon_vma->head);
 }
@@ -320,12 +343,26 @@ void __init anon_vma_init(void)
 }
 
 /*
- * Getting a lock on a stable anon_vma from a page off the LRU is
- * tricky: page_lock_anon_vma rely on RCU to guard against the races.
+ * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
+ *
+ * Since there is no serialization what so ever against page_remove_rmap()
+ * the best this function can do is return a locked anon_vma that might
+ * have been relevant to this page.
+ *
+ * The page might have been remapped to a different anon_vma or the anon_vma
+ * returned may already be freed (and even reused).
+ *
+ * All users of this function must be very careful when walking the anon_vma
+ * chain and verify that the page in question is indeed mapped in it
+ * [ something equivalent to page_mapped_in_vma() ].
+ *
+ * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
+ * that the anon_vma pointer from page->mapping is valid if there is a
+ * mapcount, we can dereference the anon_vma after observing those.
  */
-struct anon_vma *__page_lock_anon_vma(struct page *page)
+struct anon_vma *page_get_anon_vma(struct page *page)
 {
-        struct anon_vma *anon_vma, *root_anon_vma;
+        struct anon_vma *anon_vma = NULL;
         unsigned long anon_mapping;
 
         rcu_read_lock();
@@ -336,32 +373,97 @@ struct anon_vma *__page_lock_anon_vma(struct page *page)
                 goto out;
 
         anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
-        root_anon_vma = ACCESS_ONCE(anon_vma->root);
-        spin_lock(&root_anon_vma->lock);
+        if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+                anon_vma = NULL;
+                goto out;
+        }
 
         /*
          * If this page is still mapped, then its anon_vma cannot have been
-         * freed. But if it has been unmapped, we have no security against
-         * the anon_vma structure being freed and reused (for another anon_vma:
-         * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot
-         * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting
-         * anon_vma->root before page_unlock_anon_vma() is called to unlock.
+         * freed. But if it has been unmapped, we have no security against the
+         * anon_vma structure being freed and reused (for another anon_vma:
+         * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero()
+         * above cannot corrupt).
          */
-        if (page_mapped(page))
-                return anon_vma;
+        if (!page_mapped(page)) {
+                put_anon_vma(anon_vma);
+                anon_vma = NULL;
+        }
+out:
+        rcu_read_unlock();
+
+        return anon_vma;
+}
+
+/*
+ * Similar to page_get_anon_vma() except it locks the anon_vma.
+ *
+ * Its a little more complex as it tries to keep the fast path to a single
+ * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
+ * reference like with page_get_anon_vma() and then block on the mutex.
+ */
+struct anon_vma *page_lock_anon_vma(struct page *page)
+{
+        struct anon_vma *anon_vma = NULL;
+        unsigned long anon_mapping;
+
+        rcu_read_lock();
+        anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
+        if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+                goto out;
+        if (!page_mapped(page))
+                goto out;
+
+        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+        if (mutex_trylock(&anon_vma->root->mutex)) {
+                /*
+                 * If we observe a !0 refcount, then holding the lock ensures
+                 * the anon_vma will not go away, see __put_anon_vma().
+                 */
+                if (!atomic_read(&anon_vma->refcount)) {
+                        anon_vma_unlock(anon_vma);
+                        anon_vma = NULL;
+                }
+                goto out;
+        }
+
+        /* trylock failed, we got to sleep */
+        if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+                anon_vma = NULL;
+                goto out;
+        }
+
+        if (!page_mapped(page)) {
+                put_anon_vma(anon_vma);
+                anon_vma = NULL;
+                goto out;
+        }
+
+        /* we pinned the anon_vma, its safe to sleep */
+        rcu_read_unlock();
+        anon_vma_lock(anon_vma);
+
+        if (atomic_dec_and_test(&anon_vma->refcount)) {
+                /*
+                 * Oops, we held the last refcount, release the lock
+                 * and bail -- can't simply use put_anon_vma() because
+                 * we'll deadlock on the anon_vma_lock() recursion.
+                 */
+                anon_vma_unlock(anon_vma);
+                __put_anon_vma(anon_vma);
+                anon_vma = NULL;
+        }
+
+        return anon_vma;
 
-        spin_unlock(&root_anon_vma->lock);
 out:
         rcu_read_unlock();
-        return NULL;
+        return anon_vma;
 }
 
 void page_unlock_anon_vma(struct anon_vma *anon_vma)
-        __releases(&anon_vma->root->lock)
-        __releases(RCU)
 {
         anon_vma_unlock(anon_vma);
-        rcu_read_unlock();
 }
 
 /*
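Given the weaker guarantee spelled out above page_get_anon_vma(), every caller has to re-verify the page against each vma it visits. The following condensed caller is illustrative only; walk_mapping_count() is an invented name and the real users are functions such as try_to_unmap_anon() and page_referenced_anon() elsewhere in this file:

/* Illustrative only: count the vmas on the chain that really map the page. */
static int walk_mapping_count(struct page *page)
{
        struct anon_vma *anon_vma;
        struct anon_vma_chain *avc;
        int mapped = 0;

        anon_vma = page_lock_anon_vma(page);    /* may legitimately return NULL */
        if (!anon_vma)
                return 0;

        list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
                /*
                 * The anon_vma is only "possibly relevant" to this page:
                 * confirm the mapping before acting on avc->vma.
                 */
                if (page_mapped_in_vma(page, avc->vma))
                        mapped++;
        }

        page_unlock_anon_vma(anon_vma);
        return mapped;
}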
@@ -646,14 +748,14 @@ static int page_referenced_file(struct page *page,
          * The page lock not only makes sure that page->mapping cannot
          * suddenly be NULLified by truncation, it makes sure that the
          * structure at mapping cannot be freed and reused yet,
-         * so we can safely take mapping->i_mmap_lock.
+         * so we can safely take mapping->i_mmap_mutex.
          */
         BUG_ON(!PageLocked(page));
 
-        spin_lock(&mapping->i_mmap_lock);
+        mutex_lock(&mapping->i_mmap_mutex);
 
         /*
-         * i_mmap_lock does not stabilize mapcount at all, but mapcount
+         * i_mmap_mutex does not stabilize mapcount at all, but mapcount
          * is more likely to be accurate if we note it after spinning.
          */
         mapcount = page_mapcount(page);
@@ -675,7 +777,7 @@ static int page_referenced_file(struct page *page,
                         break;
         }
 
-        spin_unlock(&mapping->i_mmap_lock);
+        mutex_unlock(&mapping->i_mmap_mutex);
         return referenced;
 }
 
@@ -762,7 +864,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
 
         BUG_ON(PageAnon(page));
 
-        spin_lock(&mapping->i_mmap_lock);
+        mutex_lock(&mapping->i_mmap_mutex);
         vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                 if (vma->vm_flags & VM_SHARED) {
                         unsigned long address = vma_address(page, vma);
@@ -771,7 +873,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
                                 ret += page_mkclean_one(page, vma, address);
                 }
         }
-        spin_unlock(&mapping->i_mmap_lock);
+        mutex_unlock(&mapping->i_mmap_mutex);
         return ret;
 }
 
@@ -1119,7 +1221,7 @@ out_mlock:
         /*
          * We need mmap_sem locking, Otherwise VM_LOCKED check makes
          * unstable result and race. Plus, We can't wait here because
-         * we now hold anon_vma->lock or mapping->i_mmap_lock.
+         * we now hold anon_vma->mutex or mapping->i_mmap_mutex.
          * if trylock failed, the page remain in evictable lru and later
          * vmscan could retry to move the page to unevictable lru if the
          * page is actually mlocked.
@@ -1345,7 +1447,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
         unsigned long max_nl_size = 0;
         unsigned int mapcount;
 
-        spin_lock(&mapping->i_mmap_lock);
+        mutex_lock(&mapping->i_mmap_mutex);
         vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                 unsigned long address = vma_address(page, vma);
                 if (address == -EFAULT)
@@ -1391,7 +1493,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
         mapcount = page_mapcount(page);
         if (!mapcount)
                 goto out;
-        cond_resched_lock(&mapping->i_mmap_lock);
+        cond_resched();
 
         max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
         if (max_nl_cursor == 0)
@@ -1413,7 +1515,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
                         }
                         vma->vm_private_data = (void *) max_nl_cursor;
                 }
-                cond_resched_lock(&mapping->i_mmap_lock);
+                cond_resched();
                 max_nl_cursor += CLUSTER_SIZE;
         } while (max_nl_cursor <= max_nl_size);
 
@@ -1425,7 +1527,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
         list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
                 vma->vm_private_data = NULL;
 out:
-        spin_unlock(&mapping->i_mmap_lock);
+        mutex_unlock(&mapping->i_mmap_mutex);
         return ret;
 }
 
@@ -1544,7 +1646,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
 
         if (!mapping)
                 return ret;
-        spin_lock(&mapping->i_mmap_lock);
+        mutex_lock(&mapping->i_mmap_mutex);
         vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                 unsigned long address = vma_address(page, vma);
                 if (address == -EFAULT)
@@ -1558,7 +1660,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
          * never contain migration ptes. Decide what to do about this
          * limitation to linear when we need rmap_walk() on nonlinear.
          */
-        spin_unlock(&mapping->i_mmap_lock);
+        mutex_unlock(&mapping->i_mmap_mutex);
         return ret;
 }
 