author		Hugh Dickins <hugh.dickins@tiscali.co.uk>	2009-12-14 20:59:25 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-12-15 11:53:19 -0500
commit		db114b83ab6064d9b1d6ec5650e096c89bd95e25 (patch)
tree		15e289b25fec011238f6838c6aafa1ff5e293224 /mm
parent		5ad6468801d28c4d4ac9f48ec19297817c915f6a (diff)
ksm: hold anon_vma in rmap_item
For full functionality, page_referenced_one() and try_to_unmap_one() need
to know the vma: to pass vma down to arch-dependent flushes, or to observe
VM_LOCKED or VM_EXEC. But KSM keeps no record of vma: nor can it, since
vmas get split and merged without its knowledge.
Instead, note page's anon_vma in its rmap_item when adding to stable tree:
all the vmas which might map that page are listed by its anon_vma.
page_referenced_ksm() and try_to_unmap_ksm() then traverse the anon_vma,
first to find the probable vma, that which matches rmap_item's mm; but if
that is not enough to locate all instances, traverse again to try the
others. This catches those occasions when fork has duplicated a pte of a
ksm page, but ksmd has not yet come around to assign it an rmap_item.
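To make that two-pass traversal concrete, here is a simplified sketch (not the patch text itself) of the pattern that page_referenced_ksm() and try_to_unmap_ksm() adopt below; process_one_vma() is a placeholder standing in for page_referenced_one() or try_to_unmap_one():

	int search_new_forks = 0;
again:
	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
		struct anon_vma *anon_vma = rmap_item->anon_vma;
		struct vm_area_struct *vma;

		spin_lock(&anon_vma->lock);
		list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
			/* skip vmas which do not cover this rmap_item's address */
			if (rmap_item->address < vma->vm_start ||
			    rmap_item->address >= vma->vm_end)
				continue;
			/* pass 0: only the vma of rmap_item's mm; pass 1: only the others */
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				continue;
			process_one_vma(page, vma, rmap_item->address);	/* placeholder */
		}
		spin_unlock(&anon_vma->lock);
	}
	if (!search_new_forks++)
		goto again;

On the first pass only the vma belonging to rmap_item's mm is processed; incrementing search_new_forks flips the test so that the second pass covers only the other mms listed by the anon_vma.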
But each rmap_item in the stable tree which refers to an anon_vma needs to
take a reference to it. Andrea's anon_vma design cleverly avoided a
reference count (an anon_vma was freed as soon as its list of vmas was empty),
but KSM now needs to add one. Is a 32-bit count sufficient? I believe
so - the anon_vma is only freed when both the count is 0 and the list is empty.
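The hunks below also depend on a ksm_refcount field in struct anon_vma, plus ksm_refcount_init() and ksm_refcount() helpers; those live in include/linux/rmap.h, outside this 'mm'-limited diffstat. The following sketch of how such declarations would plausibly look is an illustration only, not text from the patch - note that atomic_t provides exactly the 32-bit count discussed above:

struct anon_vma {
	spinlock_t lock;		/* serializes access to the vma list */
#ifdef CONFIG_KSM
	atomic_t ksm_refcount;		/* held by rmap_items in the stable tree */
#endif
	struct list_head head;		/* chain of private "related" vmas */
};

#ifdef CONFIG_KSM
/* start with no stable-tree rmap_items holding this anon_vma */
static inline void ksm_refcount_init(struct anon_vma *anon_vma)
{
	atomic_set(&anon_vma->ksm_refcount, 0);
}

/* read the count, so anon_vma_unlink() can skip freeing while KSM holds it */
static inline int ksm_refcount(struct anon_vma *anon_vma)
{
	return atomic_read(&anon_vma->ksm_refcount);
}
#else
static inline void ksm_refcount_init(struct anon_vma *anon_vma)
{
}

static inline int ksm_refcount(struct anon_vma *anon_vma)
{
	return 0;
}
#endif

With something like that in place, anon_vma_unlink() (patched below in mm/rmap.c) only frees an anon_vma when its vma list is empty and no stable-tree rmap_item still holds it.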
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Chris Wright <chrisw@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--	mm/ksm.c	157
-rw-r--r--	mm/rmap.c	5
2 files changed, 98 insertions, 64 deletions
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -121,7 +121,7 @@ struct stable_node {
 /**
  * struct rmap_item - reverse mapping item for virtual addresses
  * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
- * @filler: unused space we're making available in this patch
+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
  * @mm: the memory structure this rmap_item is pointing into
  * @address: the virtual address this rmap_item tracks (+ flags in low bits)
  * @oldchecksum: previous checksum of the page at that virtual address
@@ -131,7 +131,7 @@ struct stable_node {
  */
 struct rmap_item {
 	struct rmap_item *rmap_list;
-	unsigned long filler;
+	struct anon_vma *anon_vma;	/* when stable */
 	struct mm_struct *mm;
 	unsigned long address;		/* + low bits used for flags below */
 	unsigned int oldchecksum;	/* when unstable */
@@ -196,13 +196,6 @@ static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
 static DEFINE_MUTEX(ksm_thread_mutex);
 static DEFINE_SPINLOCK(ksm_mmlist_lock);
 
-/*
- * Temporary hack for page_referenced_ksm() and try_to_unmap_ksm(),
- * later we rework things a little to get the right vma to them.
- */
-static DEFINE_SPINLOCK(ksm_fallback_vma_lock);
-static struct vm_area_struct ksm_fallback_vma;
-
 #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
 		sizeof(struct __struct), __alignof__(struct __struct),\
 		(__flags), NULL)
@@ -323,6 +316,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
 	return rmap_item->address & STABLE_FLAG;
 }
 
+static void hold_anon_vma(struct rmap_item *rmap_item,
+			  struct anon_vma *anon_vma)
+{
+	rmap_item->anon_vma = anon_vma;
+	atomic_inc(&anon_vma->ksm_refcount);
+}
+
+static void drop_anon_vma(struct rmap_item *rmap_item)
+{
+	struct anon_vma *anon_vma = rmap_item->anon_vma;
+
+	if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) {
+		int empty = list_empty(&anon_vma->head);
+		spin_unlock(&anon_vma->lock);
+		if (empty)
+			anon_vma_free(anon_vma);
+	}
+}
+
 /*
  * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
  * page tables after it has passed through ksm_exit() - which, if necessary,
@@ -472,6 +484,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 			ksm_pages_shared--;
 		}
 
+		drop_anon_vma(rmap_item);
 		rmap_item->address &= PAGE_MASK;
 
 	} else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -752,6 +765,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
 	pte_t orig_pte = __pte(0);
 	int err = -EFAULT;
 
+	if (page == kpage)			/* ksm page forked */
+		return 0;
+
 	if (!(vma->vm_flags & VM_MERGEABLE))
 		goto out;
 	if (!PageAnon(page))
@@ -805,9 +821,6 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
 	struct vm_area_struct *vma;
 	int err = -EFAULT;
 
-	if (page == kpage)			/* ksm page forked */
-		return 0;
-
 	down_read(&mm->mmap_sem);
 	if (ksm_test_exit(mm))
 		goto out;
@@ -816,6 +829,11 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
 		goto out;
 
 	err = try_to_merge_one_page(vma, page, kpage);
+	if (err)
+		goto out;
+
+	/* Must get reference to anon_vma while still holding mmap_sem */
+	hold_anon_vma(rmap_item, vma->anon_vma);
 out:
 	up_read(&mm->mmap_sem);
 	return err;
@@ -869,6 +887,11 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
 	lru_cache_add_lru(kpage, LRU_ACTIVE_ANON);
 
 	err = try_to_merge_one_page(vma, page, kpage);
+	if (err)
+		goto up;
+
+	/* Must get reference to anon_vma while still holding mmap_sem */
+	hold_anon_vma(rmap_item, vma->anon_vma);
 up:
 	up_read(&mm->mmap_sem);
 
@@ -879,8 +902,10 @@ up:
 		 * If that fails, we have a ksm page with only one pte
 		 * pointing to it: so break it.
 		 */
-		if (err)
+		if (err) {
+			drop_anon_vma(rmap_item);
 			break_cow(rmap_item);
+		}
 	}
 	if (err) {
 		put_page(kpage);
@@ -1155,7 +1180,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 			 * in which case we need to break_cow on both.
 			 */
 			if (!stable_node) {
+				drop_anon_vma(tree_rmap_item);
 				break_cow(tree_rmap_item);
+				drop_anon_vma(rmap_item);
 				break_cow(rmap_item);
 			}
 		}
@@ -1490,7 +1517,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
 	struct hlist_node *hlist;
 	unsigned int mapcount = page_mapcount(page);
 	int referenced = 0;
-	struct vm_area_struct *vma;
+	int search_new_forks = 0;
 
 	VM_BUG_ON(!PageKsm(page));
 	VM_BUG_ON(!PageLocked(page));
@@ -1498,36 +1525,40 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
 	stable_node = page_stable_node(page);
 	if (!stable_node)
 		return 0;
-
-	/*
-	 * Temporary hack: really we need anon_vma in rmap_item, to
-	 * provide the correct vma, and to find recently forked instances.
-	 * Use zalloc to avoid weirdness if any other fields are involved.
-	 */
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
-	if (!vma) {
-		spin_lock(&ksm_fallback_vma_lock);
-		vma = &ksm_fallback_vma;
-	}
-
+again:
 	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
-		if (memcg && !mm_match_cgroup(rmap_item->mm, memcg))
-			continue;
+		struct anon_vma *anon_vma = rmap_item->anon_vma;
+		struct vm_area_struct *vma;
+
+		spin_lock(&anon_vma->lock);
+		list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+			if (rmap_item->address < vma->vm_start ||
+			    rmap_item->address >= vma->vm_end)
+				continue;
+			/*
+			 * Initially we examine only the vma which covers this
+			 * rmap_item; but later, if there is still work to do,
+			 * we examine covering vmas in other mms: in case they
+			 * were forked from the original since ksmd passed.
+			 */
+			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+				continue;
 
-		vma->vm_mm = rmap_item->mm;
-		vma->vm_start = rmap_item->address;
-		vma->vm_end = vma->vm_start + PAGE_SIZE;
+			if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
+				continue;
 
-		referenced += page_referenced_one(page, vma,
+			referenced += page_referenced_one(page, vma,
 				rmap_item->address, &mapcount, vm_flags);
+			if (!search_new_forks || !mapcount)
+				break;
+		}
+		spin_unlock(&anon_vma->lock);
 		if (!mapcount)
 			goto out;
 	}
+	if (!search_new_forks++)
+		goto again;
 out:
-	if (vma == &ksm_fallback_vma)
-		spin_unlock(&ksm_fallback_vma_lock);
-	else
-		kmem_cache_free(vm_area_cachep, vma);
 	return referenced;
 }
 
@@ -1537,7 +1568,7 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
 	struct hlist_node *hlist;
 	struct rmap_item *rmap_item;
 	int ret = SWAP_AGAIN;
-	struct vm_area_struct *vma;
+	int search_new_forks = 0;
 
 	VM_BUG_ON(!PageKsm(page));
 	VM_BUG_ON(!PageLocked(page));
@@ -1545,35 +1576,37 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
 	stable_node = page_stable_node(page);
 	if (!stable_node)
 		return SWAP_FAIL;
-
-	/*
-	 * Temporary hack: really we need anon_vma in rmap_item, to
-	 * provide the correct vma, and to find recently forked instances.
-	 * Use zalloc to avoid weirdness if any other fields are involved.
-	 */
-	if (TTU_ACTION(flags) != TTU_UNMAP)
-		return SWAP_FAIL;
-
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
-	if (!vma) {
-		spin_lock(&ksm_fallback_vma_lock);
-		vma = &ksm_fallback_vma;
-	}
-
+again:
 	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
-		vma->vm_mm = rmap_item->mm;
-		vma->vm_start = rmap_item->address;
-		vma->vm_end = vma->vm_start + PAGE_SIZE;
+		struct anon_vma *anon_vma = rmap_item->anon_vma;
+		struct vm_area_struct *vma;
 
-		ret = try_to_unmap_one(page, vma, rmap_item->address, flags);
-		if (ret != SWAP_AGAIN || !page_mapped(page))
-			goto out;
+		spin_lock(&anon_vma->lock);
+		list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+			if (rmap_item->address < vma->vm_start ||
+			    rmap_item->address >= vma->vm_end)
+				continue;
+			/*
+			 * Initially we examine only the vma which covers this
+			 * rmap_item; but later, if there is still work to do,
+			 * we examine covering vmas in other mms: in case they
+			 * were forked from the original since ksmd passed.
+			 */
+			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+				continue;
+
+			ret = try_to_unmap_one(page, vma,
+					rmap_item->address, flags);
+			if (ret != SWAP_AGAIN || !page_mapped(page)) {
+				spin_unlock(&anon_vma->lock);
+				goto out;
+			}
+		}
+		spin_unlock(&anon_vma->lock);
 	}
+	if (!search_new_forks++)
+		goto again;
 out:
-	if (vma == &ksm_fallback_vma)
-		spin_unlock(&ksm_fallback_vma_lock);
-	else
-		kmem_cache_free(vm_area_cachep, vma);
 	return ret;
 }
 
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -68,7 +68,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
 	return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
 }
 
-static inline void anon_vma_free(struct anon_vma *anon_vma)
+void anon_vma_free(struct anon_vma *anon_vma)
 {
 	kmem_cache_free(anon_vma_cachep, anon_vma);
 }
@@ -172,7 +172,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
 	list_del(&vma->anon_vma_node);
 
 	/* We must garbage collect the anon_vma if it's empty */
-	empty = list_empty(&anon_vma->head);
+	empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
 	spin_unlock(&anon_vma->lock);
 
 	if (empty)
@@ -184,6 +184,7 @@ static void anon_vma_ctor(void *data)
 	struct anon_vma *anon_vma = data;
 
 	spin_lock_init(&anon_vma->lock);
+	ksm_refcount_init(anon_vma);
 	INIT_LIST_HEAD(&anon_vma->head);
 }
 