author		Hugh Dickins <hugh.dickins@tiscali.co.uk>	2009-12-14 20:59:27 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-12-15 11:53:19 -0500
commit		4035c07a895974d0ac06a56fe870ad293fc451a7
tree		b0cc7cabeb0b23d97d5bd5080836d57bc5ee4689 /mm/ksm.c
parent		db114b83ab6064d9b1d6ec5650e096c89bd95e25
ksm: take keyhole reference to page
There's a lamentable flaw in KSM swapping: the stable_node holds a reference to the ksm page, so the page to be freed cannot actually be freed until ksmd works its way around to removing the last rmap_item from its stable_node. Which in some configurations may take minutes: not quite responsive enough for memory reclaim. And we don't want to twist KSM and its locking more tightly into the rest of mm. What a pity.

But although the stable_node needs to hold a pointer to the ksm page, does it actually need to raise the reference count of that page?

No. It would need to do so if struct pages were ordinary kmalloc'ed objects; but they are more stable than that, and reused in particular ways according to particular rules.

Access to stable_node from its pointer in struct page is no problem, so long as we never free a stable_node before the ksm page itself has been freed. Access to struct page from its pointer in stable_node: reintroduce get_ksm_page(), and let that peep out through its keyhole (the stable_node pointer to ksm page), to see if that struct page still holds the right key to open it (the ksm page mapping pointer back to this stable_node).

This relies upon the established way in which free_hot_cold_page() sets an anon (including ksm) page->mapping to NULL; and relies upon no other user of a struct page to put something which looks like the original stable_node pointer (with two low bits also set) into page->mapping. It also needs the get_page_unless_zero() technique pioneered by speculative pagecache; and uses rcu_read_lock() to keep the guarantees that gives.

There are several drivers which put pointers of their own into page->mapping; but none of those could coincide with our stable_node pointers, since KSM won't free a stable_node until it sees that the page has gone.

The only problem case found is the pagetable spinlock USE_SPLIT_PTLOCKS places in struct page (my own abuse): to accommodate GENERIC_LOCKBREAK's break_lock on 32-bit, that spans both page->private and page->mapping. Since break_lock is only 0 or 1, again no confusion for get_ksm_page().

But what of DEBUG_SPINLOCK on 64-bit bigendian? When owner_cpu is 3 (matching PageKsm low bits), it might see 0xdead4ead00000003 in page->mapping, which might coincide? We could get around that by... but a better answer is to suppress USE_SPLIT_PTLOCKS when DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, to stop bloating sizeof(struct page) in their case - already proposed in an earlier mm/Kconfig patch.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Chris Wright <chrisw@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/ksm.c')
-rw-r--r--	mm/ksm.c	149
1 file changed, 110 insertions, 39 deletions
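Background for the "keyhole" check, before the diff itself: a ksm page keeps its stable_node pointer, tagged with two low bits, directly in page->mapping. The helpers below are a sketch of what already exists in this tree, quoted from memory rather than from this patch, so treat the exact definitions as approximate; the tagged value they produce is the expected_mapping key that the new get_ksm_page() compares against.

/* include/linux/mm.h in this era (approximate): low-bit tags in page->mapping */
#define PAGE_MAPPING_ANON	1
#define PAGE_MAPPING_KSM	2

/* mm/ksm.c, pre-existing helper (approximate): plant the key in the ksm page */
static inline void set_page_stable_node(struct page *page,
					struct stable_node *stable_node)
{
	page->mapping = (void *)stable_node +
				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}

Since freeing an anon page resets page->mapping to NULL, a stale stable_node can never match this key again unless some other user of the struct page stores the same tagged value, which the message above argues cannot happen.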
diff --git a/mm/ksm.c b/mm/ksm.c
index f7d121c42d01..37cc92f83a8d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -413,6 +413,12 @@ static void break_cow(struct rmap_item *rmap_item)
 	unsigned long addr = rmap_item->address;
 	struct vm_area_struct *vma;
 
+	/*
+	 * It is not an accident that whenever we want to break COW
+	 * to undo, we also need to drop a reference to the anon_vma.
+	 */
+	drop_anon_vma(rmap_item);
+
 	down_read(&mm->mmap_sem);
 	if (ksm_test_exit(mm))
 		goto out;
@@ -456,6 +462,79 @@ out: page = NULL;
 	return page;
 }
 
+static void remove_node_from_stable_tree(struct stable_node *stable_node)
+{
+	struct rmap_item *rmap_item;
+	struct hlist_node *hlist;
+
+	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+		if (rmap_item->hlist.next)
+			ksm_pages_sharing--;
+		else
+			ksm_pages_shared--;
+		drop_anon_vma(rmap_item);
+		rmap_item->address &= PAGE_MASK;
+		cond_resched();
+	}
+
+	rb_erase(&stable_node->node, &root_stable_tree);
+	free_stable_node(stable_node);
+}
+
+/*
+ * get_ksm_page: checks if the page indicated by the stable node
+ * is still its ksm page, despite having held no reference to it.
+ * In which case we can trust the content of the page, and it
+ * returns the gotten page; but if the page has now been zapped,
+ * remove the stale node from the stable tree and return NULL.
+ *
+ * You would expect the stable_node to hold a reference to the ksm page.
+ * But if it increments the page's count, swapping out has to wait for
+ * ksmd to come around again before it can free the page, which may take
+ * seconds or even minutes: much too unresponsive.  So instead we use a
+ * "keyhole reference": access to the ksm page from the stable node peeps
+ * out through its keyhole to see if that page still holds the right key,
+ * pointing back to this stable node.  This relies on freeing a PageAnon
+ * page to reset its page->mapping to NULL, and relies on no other use of
+ * a page to put something that might look like our key in page->mapping.
+ *
+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
+ * but this is different - made simpler by ksm_thread_mutex being held, but
+ * interesting for assuming that no other use of the struct page could ever
+ * put our expected_mapping into page->mapping (or a field of the union which
+ * coincides with page->mapping).  The RCU calls are not for KSM at all, but
+ * to keep the page_count protocol described with page_cache_get_speculative.
+ *
+ * Note: it is possible that get_ksm_page() will return NULL one moment,
+ * then page the next, if the page is in between page_freeze_refs() and
+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
+ * is on its way to being freed; but it is an anomaly to bear in mind.
+ */
+static struct page *get_ksm_page(struct stable_node *stable_node)
+{
+	struct page *page;
+	void *expected_mapping;
+
+	page = stable_node->page;
+	expected_mapping = (void *)stable_node +
+				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
+	rcu_read_lock();
+	if (page->mapping != expected_mapping)
+		goto stale;
+	if (!get_page_unless_zero(page))
+		goto stale;
+	if (page->mapping != expected_mapping) {
+		put_page(page);
+		goto stale;
+	}
+	rcu_read_unlock();
+	return page;
+stale:
+	rcu_read_unlock();
+	remove_node_from_stable_tree(stable_node);
+	return NULL;
+}
+
 /*
  * Removing rmap_item from stable or unstable tree.
  * This function will clean the information from the stable/unstable tree.
@@ -467,22 +546,19 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 		struct page *page;
 
 		stable_node = rmap_item->head;
-		page = stable_node->page;
-		lock_page(page);
+		page = get_ksm_page(stable_node);
+		if (!page)
+			goto out;
 
+		lock_page(page);
 		hlist_del(&rmap_item->hlist);
-		if (stable_node->hlist.first) {
-			unlock_page(page);
-			ksm_pages_sharing--;
-		} else {
-			set_page_stable_node(page, NULL);
-			unlock_page(page);
-			put_page(page);
+		unlock_page(page);
+		put_page(page);
 
-			rb_erase(&stable_node->node, &root_stable_tree);
-			free_stable_node(stable_node);
+		if (stable_node->hlist.first)
+			ksm_pages_sharing--;
+		else
 			ksm_pages_shared--;
-		}
 
 		drop_anon_vma(rmap_item);
 		rmap_item->address &= PAGE_MASK;
@@ -504,7 +580,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 		ksm_pages_unshared--;
 		rmap_item->address &= PAGE_MASK;
 	}
-
+out:
 	cond_resched();		/* we're called from many long loops */
 }
 
@@ -902,10 +978,8 @@ up:
 		 * If that fails, we have a ksm page with only one pte
 		 * pointing to it: so break it.
 		 */
-		if (err) {
-			drop_anon_vma(rmap_item);
+		if (err)
 			break_cow(rmap_item);
-		}
 	}
 	if (err) {
 		put_page(kpage);
@@ -935,21 +1009,25 @@ static struct stable_node *stable_tree_search(struct page *page)
 	}
 
 	while (node) {
+		struct page *tree_page;
 		int ret;
 
 		cond_resched();
 		stable_node = rb_entry(node, struct stable_node, node);
+		tree_page = get_ksm_page(stable_node);
+		if (!tree_page)
+			return NULL;
 
-		ret = memcmp_pages(page, stable_node->page);
+		ret = memcmp_pages(page, tree_page);
 
-		if (ret < 0)
+		if (ret < 0) {
+			put_page(tree_page);
 			node = node->rb_left;
-		else if (ret > 0)
+		} else if (ret > 0) {
+			put_page(tree_page);
 			node = node->rb_right;
-		else {
-			get_page(stable_node->page);
+		} else
 			return stable_node;
-		}
 	}
 
 	return NULL;
@@ -969,12 +1047,17 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
 	struct stable_node *stable_node;
 
 	while (*new) {
+		struct page *tree_page;
 		int ret;
 
 		cond_resched();
 		stable_node = rb_entry(*new, struct stable_node, node);
+		tree_page = get_ksm_page(stable_node);
+		if (!tree_page)
+			return NULL;
 
-		ret = memcmp_pages(kpage, stable_node->page);
+		ret = memcmp_pages(kpage, tree_page);
+		put_page(tree_page);
 
 		parent = *new;
 		if (ret < 0)
@@ -1000,7 +1083,6 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
 
 	INIT_HLIST_HEAD(&stable_node->hlist);
 
-	get_page(kpage);
 	stable_node->page = kpage;
 	set_page_stable_node(kpage, stable_node);
 
@@ -1130,19 +1212,10 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 	}
 
 	/*
-	 * A ksm page might have got here by fork, but its other
-	 * references have already been removed from the stable tree.
-	 * Or it might be left over from a break_ksm which failed
-	 * when the mem_cgroup had reached its limit: try again now.
-	 */
-	if (PageKsm(page))
-		break_cow(rmap_item);
-
-	/*
-	 * In case the hash value of the page was changed from the last time we
-	 * have calculated it, this page to be changed frequely, therefore we
-	 * don't want to insert it to the unstable tree, and we don't want to
-	 * waste our time to search if there is something identical to it there.
+	 * If the hash value of the page has changed from the last time
+	 * we calculated it, this page is changing frequently: therefore we
+	 * don't want to insert it in the unstable tree, and we don't want
+	 * to waste our time searching for something identical to it there.
 	 */
 	checksum = calc_checksum(page);
 	if (rmap_item->oldchecksum != checksum) {
@@ -1180,9 +1253,7 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 		 * in which case we need to break_cow on both.
 		 */
 		if (!stable_node) {
-			drop_anon_vma(tree_rmap_item);
 			break_cow(tree_rmap_item);
-			drop_anon_vma(rmap_item);
 			break_cow(rmap_item);
 		}
 	}
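One consequence worth spelling out (an illustrative note, not part of the patch): a successful get_ksm_page() hands its caller a page reference, so that reference must eventually be dropped with put_page(); this is why the reworked stable_tree_search() and stable_tree_insert() above call put_page(tree_page) once they are done comparing, and why remove_rmap_item_from_tree() puts the page after unlocking it. A hypothetical caller therefore follows this pattern:

/* Illustrative pattern only; stable_node is assumed to come from the stable tree. */
struct page *page = get_ksm_page(stable_node);
if (!page)
	return NULL;	/* node was stale and has already been removed */
/* ... examine or lock the page while the reference pins it ... */
put_page(page);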