Diffstat (limited to 'mm/ksm.c')

 -rw-r--r--   mm/ksm.c   149
 1 file changed, 110 insertions(+), 39 deletions(-)
@@ -413,6 +413,12 @@ static void break_cow(struct rmap_item *rmap_item)
 	unsigned long addr = rmap_item->address;
 	struct vm_area_struct *vma;
 
+	/*
+	 * It is not an accident that whenever we want to break COW
+	 * to undo, we also need to drop a reference to the anon_vma.
+	 */
+	drop_anon_vma(rmap_item);
+
 	down_read(&mm->mmap_sem);
 	if (ksm_test_exit(mm))
 		goto out;
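
The comment added here states an invariant rather than a one-off fix: undoing a merge via break_cow() must always be paired with dropping the anon_vma reference, so the drop moves into break_cow() itself, and later hunks delete the now-redundant drop_anon_vma() calls at each call site. A minimal user-space sketch of that refactor, with toy names (toy_item, toy_break_cow) that do not exist in the kernel:

#include <assert.h>

/* Toy stand-in for an rmap_item holding one anon_vma reference. */
struct toy_item {
	int anon_vma_refs;
	int cow_broken;
};

static void toy_drop_anon_vma(struct toy_item *it)
{
	assert(it->anon_vma_refs > 0);
	it->anon_vma_refs--;
}

/*
 * After the change the undo path owns the paired release, so no
 * call site can break COW and forget the drop.
 */
static void toy_break_cow(struct toy_item *it)
{
	toy_drop_anon_vma(it);
	it->cow_broken = 1;
}

int main(void)
{
	struct toy_item it = { .anon_vma_refs = 1, .cow_broken = 0 };

	toy_break_cow(&it);
	assert(it.anon_vma_refs == 0 && it.cow_broken);
	return 0;
}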
@@ -456,6 +462,79 @@ out:	page = NULL;
 	return page;
 }
 
+static void remove_node_from_stable_tree(struct stable_node *stable_node)
+{
+	struct rmap_item *rmap_item;
+	struct hlist_node *hlist;
+
+	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+		if (rmap_item->hlist.next)
+			ksm_pages_sharing--;
+		else
+			ksm_pages_shared--;
+		drop_anon_vma(rmap_item);
+		rmap_item->address &= PAGE_MASK;
+		cond_resched();
+	}
+
+	rb_erase(&stable_node->node, &root_stable_tree);
+	free_stable_node(stable_node);
+}
+
+/*
+ * get_ksm_page: checks if the page indicated by the stable node
+ * is still its ksm page, despite having held no reference to it.
+ * In which case we can trust the content of the page, and it
+ * returns the gotten page; but if the page has now been zapped,
+ * remove the stale node from the stable tree and return NULL.
+ *
+ * You would expect the stable_node to hold a reference to the ksm page.
+ * But if it increments the page's count, swapping out has to wait for
+ * ksmd to come around again before it can free the page, which may take
+ * seconds or even minutes: much too unresponsive. So instead we use a
+ * "keyhole reference": access to the ksm page from the stable node peeps
+ * out through its keyhole to see if that page still holds the right key,
+ * pointing back to this stable node. This relies on freeing a PageAnon
+ * page to reset its page->mapping to NULL, and relies on no other use of
+ * a page to put something that might look like our key in page->mapping.
+ *
+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
+ * but this is different - made simpler by ksm_thread_mutex being held, but
+ * interesting for assuming that no other use of the struct page could ever
+ * put our expected_mapping into page->mapping (or a field of the union which
+ * coincides with page->mapping). The RCU calls are not for KSM at all, but
+ * to keep the page_count protocol described with page_cache_get_speculative.
+ *
+ * Note: it is possible that get_ksm_page() will return NULL one moment,
+ * then page the next, if the page is in between page_freeze_refs() and
+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
+ * is on its way to being freed; but it is an anomaly to bear in mind.
+ */
+static struct page *get_ksm_page(struct stable_node *stable_node)
+{
+	struct page *page;
+	void *expected_mapping;
+
+	page = stable_node->page;
+	expected_mapping = (void *)stable_node +
+				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
+	rcu_read_lock();
+	if (page->mapping != expected_mapping)
+		goto stale;
+	if (!get_page_unless_zero(page))
+		goto stale;
+	if (page->mapping != expected_mapping) {
+		put_page(page);
+		goto stale;
+	}
+	rcu_read_unlock();
+	return page;
+stale:
+	rcu_read_unlock();
+	remove_node_from_stable_tree(stable_node);
+	return NULL;
+}
+
 /*
  * Removing rmap_item from stable or unstable tree.
  * This function will clean the information from the stable/unstable tree.
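
The "keyhole reference" described above is a speculative-reference pattern: hold an unreferenced pointer, and on each use raise the refcount only if it is still nonzero, then recheck the object's identity key. Below is a minimal user-space sketch using C11 atomics in place of the kernel's page_count/RCU protocol; struct object, struct keyhole and take_ref() are hypothetical names, and the sketch assumes objects are type-stable (never returned to the allocator), which is what lets the kernel touch a struct page it holds no reference to:

#include <stdatomic.h>
#include <stddef.h>

struct keyhole;				/* stands in for stable_node */

struct object {				/* stands in for struct page */
	_Atomic int refcount;		/* 0 means freeing has begun */
	struct keyhole *_Atomic key;	/* stands in for page->mapping */
};

struct keyhole {
	struct object *obj;		/* unreferenced pointer: the keyhole */
};

/* Take a real reference through the keyhole, or NULL if it went stale. */
static struct object *take_ref(struct keyhole *kh)
{
	struct object *obj = kh->obj;
	int old;

	/* First peep: does the object still point back at this keyhole? */
	if (atomic_load(&obj->key) != kh)
		return NULL;

	/* Raise the refcount only if it has not already hit zero. */
	old = atomic_load(&obj->refcount);
	do {
		if (old == 0)
			return NULL;
	} while (!atomic_compare_exchange_weak(&obj->refcount, &old, old + 1));

	/* Recheck: we may have raced with a free and reuse of the object. */
	if (atomic_load(&obj->key) != kh) {
		atomic_fetch_sub(&obj->refcount, 1);
		return NULL;
	}
	return obj;
}

The second identity check after the increment closes the race where the object is freed and reused between the first peep and the refcount bump; it plays the same role as the second page->mapping test in get_ksm_page().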
@@ -467,22 +546,19 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 		struct page *page;
 
 		stable_node = rmap_item->head;
-		page = stable_node->page;
-		lock_page(page);
+		page = get_ksm_page(stable_node);
+		if (!page)
+			goto out;
 
+		lock_page(page);
 		hlist_del(&rmap_item->hlist);
-		if (stable_node->hlist.first) {
-			unlock_page(page);
-			ksm_pages_sharing--;
-		} else {
-			set_page_stable_node(page, NULL);
-			unlock_page(page);
-			put_page(page);
+		unlock_page(page);
+		put_page(page);
 
-			rb_erase(&stable_node->node, &root_stable_tree);
-			free_stable_node(stable_node);
+		if (stable_node->hlist.first)
+			ksm_pages_sharing--;
+		else
 			ksm_pages_shared--;
-		}
 
 		drop_anon_vma(rmap_item);
 		rmap_item->address &= PAGE_MASK;
@@ -504,7 +580,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 		ksm_pages_unshared--;
 		rmap_item->address &= PAGE_MASK;
 	}
-
+out:
 	cond_resched();		/* we're called from many long loops */
 }
 
@@ -902,10 +978,8 @@ up:
 		 * If that fails, we have a ksm page with only one pte
 		 * pointing to it: so break it.
 		 */
-		if (err) {
-			drop_anon_vma(rmap_item);
+		if (err)
 			break_cow(rmap_item);
-		}
 	}
 	if (err) {
 		put_page(kpage);
@@ -935,21 +1009,25 @@ static struct stable_node *stable_tree_search(struct page *page)
 	}
 
 	while (node) {
+		struct page *tree_page;
 		int ret;
 
 		cond_resched();
 		stable_node = rb_entry(node, struct stable_node, node);
+		tree_page = get_ksm_page(stable_node);
+		if (!tree_page)
+			return NULL;
 
-		ret = memcmp_pages(page, stable_node->page);
+		ret = memcmp_pages(page, tree_page);
 
-		if (ret < 0)
+		if (ret < 0) {
+			put_page(tree_page);
 			node = node->rb_left;
-		else if (ret > 0)
+		} else if (ret > 0) {
+			put_page(tree_page);
 			node = node->rb_right;
-		else {
-			get_page(stable_node->page);
+		} else
 			return stable_node;
-		}
 	}
 
 	return NULL;
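
The search loop now follows the same discipline: every node visited during the descent has its page pinned by get_ksm_page() before memcmp_pages(), the reference is dropped on the branches not taken, and only a hit is returned still pinned (a stale node aborts the whole search, since get_ksm_page() has just rewritten the tree underneath us). A self-contained toy version of that pin-compare-unpin descent; knode, pin_node() and the stale/pinned fields are invented for the sketch:

#include <stddef.h>

/* Hypothetical tree node whose payload can be zapped behind our back. */
struct knode {
	struct knode *left, *right;
	int key;
	int stale;	/* set when the payload has been zapped */
	int pinned;	/* toy stand-in for a page reference count */
};

/* Pin the payload, or fail if the node has gone stale. */
static struct knode *pin_node(struct knode *n)
{
	if (n->stale)
		return NULL;
	n->pinned++;
	return n;
}

static void unpin_node(struct knode *n)
{
	n->pinned--;
}

static struct knode *search(struct knode *root, int key)
{
	struct knode *n = root;

	while (n) {
		if (!pin_node(n))
			return NULL;	/* stale node: bail out, as KSM does */
		if (key < n->key) {
			unpin_node(n);
			n = n->left;
		} else if (key > n->key) {
			unpin_node(n);
			n = n->right;
		} else {
			return n;	/* caller must unpin_node() */
		}
	}
	return NULL;
}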
@@ -969,12 +1047,17 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
 	struct stable_node *stable_node;
 
 	while (*new) {
+		struct page *tree_page;
 		int ret;
 
 		cond_resched();
 		stable_node = rb_entry(*new, struct stable_node, node);
+		tree_page = get_ksm_page(stable_node);
+		if (!tree_page)
+			return NULL;
 
-		ret = memcmp_pages(kpage, stable_node->page);
+		ret = memcmp_pages(kpage, tree_page);
+		put_page(tree_page);
 
 		parent = *new;
 		if (ret < 0)
@@ -1000,7 +1083,6 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
 
 	INIT_HLIST_HEAD(&stable_node->hlist);
 
-	get_page(kpage);
 	stable_node->page = kpage;
 	set_page_stable_node(kpage, stable_node);
 
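
Note what vanished in these two hunks: stable_tree_insert() no longer takes get_page(kpage) when creating the node. stable_node->page is now an unreferenced keyhole pointer, and the back-pointer installed by set_page_stable_node() is the key that get_ksm_page() will later verify. In terms of the earlier user-space sketch (repeating its toy types), insertion reduces to publishing both halves of that link:

#include <stdatomic.h>

struct keyhole;

struct object {
	_Atomic int refcount;
	struct keyhole *_Atomic key;
};

struct keyhole {
	struct object *obj;
};

/*
 * Publish a keyhole without raising obj->refcount: only the
 * back-pointer stored in obj->key keeps the link meaningful.
 */
static void publish_keyhole(struct keyhole *kh, struct object *obj)
{
	kh->obj = obj;			/* like stable_node->page = kpage */
	atomic_store(&obj->key, kh);	/* like set_page_stable_node() */
}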
@@ -1130,19 +1212,10 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 	}
 
 	/*
-	 * A ksm page might have got here by fork, but its other
-	 * references have already been removed from the stable tree.
-	 * Or it might be left over from a break_ksm which failed
-	 * when the mem_cgroup had reached its limit: try again now.
-	 */
-	if (PageKsm(page))
-		break_cow(rmap_item);
-
-	/*
-	 * In case the hash value of the page was changed from the last time we
-	 * have calculated it, this page to be changed frequely, therefore we
-	 * don't want to insert it to the unstable tree, and we don't want to
-	 * waste our time to search if there is something identical to it there.
+	 * If the hash value of the page has changed from the last time
+	 * we calculated it, this page is changing frequently: therefore we
+	 * don't want to insert it in the unstable tree, and we don't want
+	 * to waste our time searching for something identical to it there.
 	 */
 	checksum = calc_checksum(page);
 	if (rmap_item->oldchecksum != checksum) {
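
The rewritten comment describes a volatility filter: a page earns a place in the unstable tree only once its checksum has held still for a full scan pass, because comparing against rapidly changing pages is wasted work. A self-contained sketch of that filter, with FNV-1a standing in for the kernel's checksum and struct item as a toy rmap_item:

#include <stdint.h>
#include <stddef.h>

struct item {
	uint32_t oldchecksum;	/* checksum seen on the previous pass */
};

/* FNV-1a over the buffer: a stand-in for the kernel's calc_checksum(). */
static uint32_t calc_checksum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 2166136261u;
	size_t i;

	for (i = 0; i < len; i++)
		sum = (sum ^ buf[i]) * 16777619u;
	return sum;
}

/* Returns 1 only if the buffer was unchanged since the previous pass. */
static int stable_enough(struct item *it, const uint8_t *buf, size_t len)
{
	uint32_t checksum = calc_checksum(buf, len);

	if (it->oldchecksum != checksum) {
		it->oldchecksum = checksum;	/* remember; retry next pass */
		return 0;			/* still changing: skip it */
	}
	return 1;
}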
@@ -1180,9 +1253,7 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 		 * in which case we need to break_cow on both.
 		 */
 		if (!stable_node) {
-			drop_anon_vma(tree_rmap_item);
 			break_cow(tree_rmap_item);
-			drop_anon_vma(rmap_item);
 			break_cow(rmap_item);
 		}
 	}