path: root/mm
author	Hugh Dickins <hugh.dickins@tiscali.co.uk>	2009-12-14 20:59:25 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-12-15 11:53:19 -0500
commit	db114b83ab6064d9b1d6ec5650e096c89bd95e25 (patch)
tree	15e289b25fec011238f6838c6aafa1ff5e293224 /mm
parent	5ad6468801d28c4d4ac9f48ec19297817c915f6a (diff)
ksm: hold anon_vma in rmap_item
For full functionality, page_referenced_one() and try_to_unmap_one() need to know the vma: to pass vma down to arch-dependent flushes, or to observe VM_LOCKED or VM_EXEC.  But KSM keeps no record of vma: nor can it, since vmas get split and merged without its knowledge.

Instead, note page's anon_vma in its rmap_item when adding to stable tree: all the vmas which might map that page are listed by its anon_vma.

page_referenced_ksm() and try_to_unmap_ksm() then traverse the anon_vma, first to find the probable vma, that which matches rmap_item's mm; but if that is not enough to locate all instances, traverse again to try the others.  This catches those occasions when fork has duplicated a pte of a ksm page, but ksmd has not yet come around to assign it an rmap_item.

But each rmap_item in the stable tree which refers to an anon_vma needs to take a reference to it.  Andrea's anon_vma design cleverly avoided a reference count (an anon_vma was free when its list of vmas was empty), but KSM now needs to add that.  Is a 32-bit count sufficient?  I believe so - the anon_vma is only free when both count is 0 and list is empty.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Chris Wright <chrisw@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--	mm/ksm.c	157
-rw-r--r--	mm/rmap.c	5
2 files changed, 98 insertions(+), 64 deletions(-)
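Editor's note: the mm/rmap.c hunks below call ksm_refcount_init() and ksm_refcount(), and the new drop_anon_vma() in mm/ksm.c manipulates anon_vma->ksm_refcount; the field and those helpers live in include/linux/rmap.h, which is outside this mm-only view. A minimal sketch of what they amount to, assuming the field is an atomic_t guarded by CONFIG_KSM (the names follow this patch; the exact header layout is an assumption):

struct anon_vma {
	spinlock_t lock;	/* serializes access to the vma list below */
#ifdef CONFIG_KSM
	atomic_t ksm_refcount;	/* taken by rmap_items in the stable tree */
#endif
	struct list_head head;	/* chain of "related" vmas */
};

#ifdef CONFIG_KSM
static inline void ksm_refcount_init(struct anon_vma *anon_vma)
{
	atomic_set(&anon_vma->ksm_refcount, 0);
}

static inline int ksm_refcount(struct anon_vma *anon_vma)
{
	return atomic_read(&anon_vma->ksm_refcount);
}
#else
static inline void ksm_refcount_init(struct anon_vma *anon_vma)
{
}

static inline int ksm_refcount(struct anon_vma *anon_vma)
{
	return 0;
}
#endif /* CONFIG_KSM */

With something like this in place, an anon_vma is freed only when its vma list is empty and the KSM count is zero: anon_vma_unlink() checks both, and drop_anon_vma() re-checks list_empty() under the anon_vma lock once the count drops to zero, as the commit message describes.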
diff --git a/mm/ksm.c b/mm/ksm.c
index 2f58ceebfe8f..f7d121c42d01 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -121,7 +121,7 @@ struct stable_node {
 /**
  * struct rmap_item - reverse mapping item for virtual addresses
  * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
- * @filler: unused space we're making available in this patch
+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
  * @mm: the memory structure this rmap_item is pointing into
  * @address: the virtual address this rmap_item tracks (+ flags in low bits)
  * @oldchecksum: previous checksum of the page at that virtual address
@@ -131,7 +131,7 @@ struct stable_node {
  */
 struct rmap_item {
 	struct rmap_item *rmap_list;
-	unsigned long filler;
+	struct anon_vma *anon_vma;	/* when stable */
 	struct mm_struct *mm;
 	unsigned long address;		/* + low bits used for flags below */
 	unsigned int oldchecksum;	/* when unstable */
@@ -196,13 +196,6 @@ static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
 static DEFINE_MUTEX(ksm_thread_mutex);
 static DEFINE_SPINLOCK(ksm_mmlist_lock);
 
-/*
- * Temporary hack for page_referenced_ksm() and try_to_unmap_ksm(),
- * later we rework things a little to get the right vma to them.
- */
-static DEFINE_SPINLOCK(ksm_fallback_vma_lock);
-static struct vm_area_struct ksm_fallback_vma;
-
 #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
 		sizeof(struct __struct), __alignof__(struct __struct),\
 		(__flags), NULL)
@@ -323,6 +316,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
 	return rmap_item->address & STABLE_FLAG;
 }
 
+static void hold_anon_vma(struct rmap_item *rmap_item,
+			  struct anon_vma *anon_vma)
+{
+	rmap_item->anon_vma = anon_vma;
+	atomic_inc(&anon_vma->ksm_refcount);
+}
+
+static void drop_anon_vma(struct rmap_item *rmap_item)
+{
+	struct anon_vma *anon_vma = rmap_item->anon_vma;
+
+	if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) {
+		int empty = list_empty(&anon_vma->head);
+		spin_unlock(&anon_vma->lock);
+		if (empty)
+			anon_vma_free(anon_vma);
+	}
+}
+
 /*
  * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
  * page tables after it has passed through ksm_exit() - which, if necessary,
@@ -472,6 +484,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 			ksm_pages_shared--;
 		}
 
+		drop_anon_vma(rmap_item);
 		rmap_item->address &= PAGE_MASK;
 
 	} else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -752,6 +765,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
 	pte_t orig_pte = __pte(0);
 	int err = -EFAULT;
 
+	if (page == kpage)			/* ksm page forked */
+		return 0;
+
 	if (!(vma->vm_flags & VM_MERGEABLE))
 		goto out;
 	if (!PageAnon(page))
@@ -805,9 +821,6 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
 	struct vm_area_struct *vma;
 	int err = -EFAULT;
 
-	if (page == kpage)			/* ksm page forked */
-		return 0;
-
 	down_read(&mm->mmap_sem);
 	if (ksm_test_exit(mm))
 		goto out;
@@ -816,6 +829,11 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
 		goto out;
 
 	err = try_to_merge_one_page(vma, page, kpage);
+	if (err)
+		goto out;
+
+	/* Must get reference to anon_vma while still holding mmap_sem */
+	hold_anon_vma(rmap_item, vma->anon_vma);
 out:
 	up_read(&mm->mmap_sem);
 	return err;
@@ -869,6 +887,11 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
 	lru_cache_add_lru(kpage, LRU_ACTIVE_ANON);
 
 	err = try_to_merge_one_page(vma, page, kpage);
+	if (err)
+		goto up;
+
+	/* Must get reference to anon_vma while still holding mmap_sem */
+	hold_anon_vma(rmap_item, vma->anon_vma);
 up:
 	up_read(&mm->mmap_sem);
 
@@ -879,8 +902,10 @@ up:
 		 * If that fails, we have a ksm page with only one pte
 		 * pointing to it: so break it.
 		 */
-		if (err)
+		if (err) {
+			drop_anon_vma(rmap_item);
 			break_cow(rmap_item);
+		}
 	}
 	if (err) {
 		put_page(kpage);
@@ -1155,7 +1180,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 			 * in which case we need to break_cow on both.
 			 */
 			if (!stable_node) {
+				drop_anon_vma(tree_rmap_item);
 				break_cow(tree_rmap_item);
+				drop_anon_vma(rmap_item);
 				break_cow(rmap_item);
 			}
 		}
@@ -1490,7 +1517,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
 	struct hlist_node *hlist;
 	unsigned int mapcount = page_mapcount(page);
 	int referenced = 0;
-	struct vm_area_struct *vma;
+	int search_new_forks = 0;
 
 	VM_BUG_ON(!PageKsm(page));
 	VM_BUG_ON(!PageLocked(page));
@@ -1498,36 +1525,40 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
 	stable_node = page_stable_node(page);
 	if (!stable_node)
 		return 0;
-
-	/*
-	 * Temporary hack: really we need anon_vma in rmap_item, to
-	 * provide the correct vma, and to find recently forked instances.
-	 * Use zalloc to avoid weirdness if any other fields are involved.
-	 */
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
-	if (!vma) {
-		spin_lock(&ksm_fallback_vma_lock);
-		vma = &ksm_fallback_vma;
-	}
-
+again:
 	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
-		if (memcg && !mm_match_cgroup(rmap_item->mm, memcg))
-			continue;
+		struct anon_vma *anon_vma = rmap_item->anon_vma;
+		struct vm_area_struct *vma;
+
+		spin_lock(&anon_vma->lock);
+		list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+			if (rmap_item->address < vma->vm_start ||
+			    rmap_item->address >= vma->vm_end)
+				continue;
+			/*
+			 * Initially we examine only the vma which covers this
+			 * rmap_item; but later, if there is still work to do,
+			 * we examine covering vmas in other mms: in case they
+			 * were forked from the original since ksmd passed.
+			 */
+			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+				continue;
 
-		vma->vm_mm = rmap_item->mm;
-		vma->vm_start = rmap_item->address;
-		vma->vm_end = vma->vm_start + PAGE_SIZE;
+			if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
+				continue;
 
-		referenced += page_referenced_one(page, vma,
+			referenced += page_referenced_one(page, vma,
 				rmap_item->address, &mapcount, vm_flags);
+			if (!search_new_forks || !mapcount)
+				break;
+		}
+		spin_unlock(&anon_vma->lock);
 		if (!mapcount)
 			goto out;
 	}
+	if (!search_new_forks++)
+		goto again;
 out:
-	if (vma == &ksm_fallback_vma)
-		spin_unlock(&ksm_fallback_vma_lock);
-	else
-		kmem_cache_free(vm_area_cachep, vma);
 	return referenced;
 }
 
@@ -1537,7 +1568,7 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
 	struct hlist_node *hlist;
 	struct rmap_item *rmap_item;
 	int ret = SWAP_AGAIN;
-	struct vm_area_struct *vma;
+	int search_new_forks = 0;
 
 	VM_BUG_ON(!PageKsm(page));
 	VM_BUG_ON(!PageLocked(page));
@@ -1545,35 +1576,37 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
 	stable_node = page_stable_node(page);
 	if (!stable_node)
 		return SWAP_FAIL;
-
-	/*
-	 * Temporary hack: really we need anon_vma in rmap_item, to
-	 * provide the correct vma, and to find recently forked instances.
-	 * Use zalloc to avoid weirdness if any other fields are involved.
-	 */
-	if (TTU_ACTION(flags) != TTU_UNMAP)
-		return SWAP_FAIL;
-
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
-	if (!vma) {
-		spin_lock(&ksm_fallback_vma_lock);
-		vma = &ksm_fallback_vma;
-	}
-
+again:
 	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
-		vma->vm_mm = rmap_item->mm;
-		vma->vm_start = rmap_item->address;
-		vma->vm_end = vma->vm_start + PAGE_SIZE;
+		struct anon_vma *anon_vma = rmap_item->anon_vma;
+		struct vm_area_struct *vma;
 
-		ret = try_to_unmap_one(page, vma, rmap_item->address, flags);
-		if (ret != SWAP_AGAIN || !page_mapped(page))
-			goto out;
+		spin_lock(&anon_vma->lock);
+		list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+			if (rmap_item->address < vma->vm_start ||
+			    rmap_item->address >= vma->vm_end)
+				continue;
+			/*
+			 * Initially we examine only the vma which covers this
+			 * rmap_item; but later, if there is still work to do,
+			 * we examine covering vmas in other mms: in case they
+			 * were forked from the original since ksmd passed.
+			 */
+			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+				continue;
+
+			ret = try_to_unmap_one(page, vma,
+					rmap_item->address, flags);
+			if (ret != SWAP_AGAIN || !page_mapped(page)) {
+				spin_unlock(&anon_vma->lock);
+				goto out;
+			}
+		}
+		spin_unlock(&anon_vma->lock);
 	}
+	if (!search_new_forks++)
+		goto again;
 out:
-	if (vma == &ksm_fallback_vma)
-		spin_unlock(&ksm_fallback_vma_lock);
-	else
-		kmem_cache_free(vm_area_cachep, vma);
 	return ret;
 }
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 869aaa3206a2..ebdf582ef185 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -68,7 +68,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
 	return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
 }
 
-static inline void anon_vma_free(struct anon_vma *anon_vma)
+void anon_vma_free(struct anon_vma *anon_vma)
 {
 	kmem_cache_free(anon_vma_cachep, anon_vma);
 }
@@ -172,7 +172,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
 	list_del(&vma->anon_vma_node);
 
 	/* We must garbage collect the anon_vma if it's empty */
-	empty = list_empty(&anon_vma->head);
+	empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
 	spin_unlock(&anon_vma->lock);
 
 	if (empty)
@@ -184,6 +184,7 @@ static void anon_vma_ctor(void *data)
 	struct anon_vma *anon_vma = data;
 
 	spin_lock_init(&anon_vma->lock);
+	ksm_refcount_init(anon_vma);
 	INIT_LIST_HEAD(&anon_vma->head);
 }
 