author     Hugh Dickins <hugh.dickins@tiscali.co.uk>        2009-12-14 20:59:24 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2009-12-15 11:53:19 -0500
commit     5ad6468801d28c4d4ac9f48ec19297817c915f6a
tree       edd8dc48693f43278d6fe1614aca2bf660d4dc10 /mm/ksm.c
parent     73848b4684e84a84cfd1555af78d41158f31e16b
ksm: let shared pages be swappable
Initial implementation for swapping out KSM's shared pages: add
page_referenced_ksm() and try_to_unmap_ksm(), which rmap.c calls when
faced with a PageKsm page.
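For reference, the mm/rmap.c side of that dispatch is outside the mm/ksm.c diffstat shown here, so the sketch below is reconstructed from context rather than quoted from the patch; treat the surrounding lines as assumed. page_referenced() gains an analogous branch that calls page_referenced_ksm() when PageKsm(page) is true.

/*
 * Sketch of the assumed try_to_unmap() dispatch in mm/rmap.c
 * (2.6.32-era API); not part of this mm/ksm.c diff.
 */
int try_to_unmap(struct page *page, enum ttu_flags flags)
{
        int ret;

        BUG_ON(!PageLocked(page));

        if (unlikely(PageKsm(page)))
                ret = try_to_unmap_ksm(page, flags);
        else if (PageAnon(page))
                ret = try_to_unmap_anon(page, flags);
        else
                ret = try_to_unmap_file(page, flags);
        if (ret != SWAP_MLOCK && !page_mapped(page))
                ret = SWAP_SUCCESS;
        return ret;
}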
Most of what's needed can be got from the rmap_items listed from the
stable_node of the ksm page, without discovering the actual vma: so in
this patch just fake up a struct vma for page_referenced_one() or
try_to_unmap_one(), then refine that in the next patch.
Add VM_NONLINEAR to ksm_madvise()'s list of exclusions: it has always been
implicit there (being only set with VM_SHARED, already excluded), but
let's make it explicit, to help justify the lack of nonlinear unmap.
Rely on the page lock to protect against concurrent modifications to that
page's node of the stable tree.
The awkward part is not swapout but swapin: do_swap_page() and
page_add_anon_rmap() now have to allow for new possibilities - perhaps a
ksm page still in swapcache, perhaps a swapcache page associated with one
location in one anon_vma now needed for another location or anon_vma.
(And the vma might even be no longer VM_MERGEABLE when that happens.)
ksm_might_need_to_copy() checks for that case, and supplies a duplicate
page when necessary, simply leaving it to a subsequent pass of ksmd to
rediscover the identity and merge them back into one ksm page.
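ksm_might_need_to_copy() itself is a header-side check in front of ksm_does_need_to_copy() (which is added to mm/ksm.c below); since include/linux/ksm.h is not in this diffstat, the exact tests in this sketch are assumed rather than quoted. The intent is simply: reuse the swapcache page in place when it still matches the faulting vma's anon_vma and offset, copy it otherwise.

/*
 * Assumed header-side check: reuse the swapcache page when it still
 * belongs to this anon_vma/offset, otherwise hand it to
 * ksm_does_need_to_copy() for a fresh copy.
 */
static inline struct page *ksm_might_need_to_copy(struct page *page,
                        struct vm_area_struct *vma, unsigned long address)
{
        struct anon_vma *anon_vma = page_anon_vma(page);

        if (!anon_vma ||
            (anon_vma == vma->anon_vma &&
             page->index == linear_page_index(vma, address)))
                return page;

        return ksm_does_need_to_copy(page, vma, address);
}

In do_swap_page() this would presumably be called on the locked swapcache page, along the lines of page = ksm_might_need_to_copy(page, vma, address), with a NULL return (allocation failure) treated as OOM.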
Disappointingly primitive: but the alternative would have to accumulate
unswappable info about the swapped out ksm pages, limiting swappability.
Remove page_add_ksm_rmap(): page_add_anon_rmap() now has to allow for the
particular case it was handling, so just use it instead.
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Chris Wright <chrisw@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/ksm.c')
-rw-r--r--  mm/ksm.c | 172
1 file changed, 157 insertions(+), 15 deletions(-)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -196,6 +196,13 @@ static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
 static DEFINE_MUTEX(ksm_thread_mutex);
 static DEFINE_SPINLOCK(ksm_mmlist_lock);
 
+/*
+ * Temporary hack for page_referenced_ksm() and try_to_unmap_ksm(),
+ * later we rework things a little to get the right vma to them.
+ */
+static DEFINE_SPINLOCK(ksm_fallback_vma_lock);
+static struct vm_area_struct ksm_fallback_vma;
+
 #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
                 sizeof(struct __struct), __alignof__(struct __struct),\
                 (__flags), NULL)
@@ -445,14 +452,20 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 {
         if (rmap_item->address & STABLE_FLAG) {
                 struct stable_node *stable_node;
+                struct page *page;
 
                 stable_node = rmap_item->head;
+                page = stable_node->page;
+                lock_page(page);
+
                 hlist_del(&rmap_item->hlist);
-                if (stable_node->hlist.first)
+                if (stable_node->hlist.first) {
+                        unlock_page(page);
                         ksm_pages_sharing--;
-                else {
-                        set_page_stable_node(stable_node->page, NULL);
-                        put_page(stable_node->page);
+                } else {
+                        set_page_stable_node(page, NULL);
+                        unlock_page(page);
+                        put_page(page);
 
                         rb_erase(&stable_node->node, &root_stable_tree);
                         free_stable_node(stable_node);
@@ -710,7 +723,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
         }
 
         get_page(kpage);
-        page_add_ksm_rmap(kpage);
+        page_add_anon_rmap(kpage, vma, addr);
 
         flush_cache_page(vma, addr, pte_pfn(*ptep));
         ptep_clear_flush(vma, addr, ptep);
@@ -763,8 +776,16 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
             pages_identical(page, kpage))
                 err = replace_page(vma, page, kpage, orig_pte);
 
-        if ((vma->vm_flags & VM_LOCKED) && !err)
+        if ((vma->vm_flags & VM_LOCKED) && !err) {
                 munlock_vma_page(page);
+                if (!PageMlocked(kpage)) {
+                        unlock_page(page);
+                        lru_add_drain();
+                        lock_page(kpage);
+                        mlock_vma_page(kpage);
+                        page = kpage;           /* for final unlock */
+                }
+        }
 
         unlock_page(page);
 out:
@@ -841,7 +862,11 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
 
         copy_user_highpage(kpage, page, rmap_item->address, vma);
 
+        SetPageDirty(kpage);
+        __SetPageUptodate(kpage);
+        SetPageSwapBacked(kpage);
         set_page_stable_node(kpage, NULL);      /* mark it PageKsm */
+        lru_cache_add_lru(kpage, LRU_ACTIVE_ANON);
 
         err = try_to_merge_one_page(vma, page, kpage);
 up:
@@ -1071,7 +1096,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
                  * The page was successfully merged:
                  * add its rmap_item to the stable tree.
                  */
+                lock_page(kpage);
                 stable_tree_append(rmap_item, stable_node);
+                unlock_page(kpage);
         }
         put_page(kpage);
         return;
@@ -1112,11 +1139,13 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
                 if (kpage) {
                         remove_rmap_item_from_tree(tree_rmap_item);
 
+                        lock_page(kpage);
                         stable_node = stable_tree_insert(kpage);
                         if (stable_node) {
                                 stable_tree_append(tree_rmap_item, stable_node);
                                 stable_tree_append(rmap_item, stable_node);
                         }
+                        unlock_page(kpage);
                         put_page(kpage);
 
                         /*
@@ -1285,14 +1314,6 @@ static void ksm_do_scan(unsigned int scan_npages)
                         return;
                 if (!PageKsm(page) || !in_stable_tree(rmap_item))
                         cmp_and_merge_page(page, rmap_item);
-                else if (page_mapcount(page) == 1) {
-                        /*
-                         * Replace now-unshared ksm page by ordinary page.
-                         */
-                        break_cow(rmap_item);
-                        remove_rmap_item_from_tree(rmap_item);
-                        rmap_item->oldchecksum = calc_checksum(page);
-                }
                 put_page(page);
         }
 }
@@ -1337,7 +1358,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
         if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
                          VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
                          VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE |
-                         VM_MIXEDMAP  | VM_SAO))
+                         VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
                 return 0;               /* just ignore the advice */
 
         if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
@@ -1435,6 +1456,127 @@ void __ksm_exit(struct mm_struct *mm)
         }
 }
 
+struct page *ksm_does_need_to_copy(struct page *page,
+                        struct vm_area_struct *vma, unsigned long address)
+{
+        struct page *new_page;
+
+        unlock_page(page);      /* any racers will COW it, not modify it */
+
+        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+        if (new_page) {
+                copy_user_highpage(new_page, page, address, vma);
+
+                SetPageDirty(new_page);
+                __SetPageUptodate(new_page);
+                SetPageSwapBacked(new_page);
+                __set_page_locked(new_page);
+
+                if (page_evictable(new_page, vma))
+                        lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
+                else
+                        add_page_to_unevictable_list(new_page);
+        }
+
+        page_cache_release(page);
+        return new_page;
+}
+
+int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
+                        unsigned long *vm_flags)
+{
+        struct stable_node *stable_node;
+        struct rmap_item *rmap_item;
+        struct hlist_node *hlist;
+        unsigned int mapcount = page_mapcount(page);
+        int referenced = 0;
+        struct vm_area_struct *vma;
+
+        VM_BUG_ON(!PageKsm(page));
+        VM_BUG_ON(!PageLocked(page));
+
+        stable_node = page_stable_node(page);
+        if (!stable_node)
+                return 0;
+
+        /*
+         * Temporary hack: really we need anon_vma in rmap_item, to
+         * provide the correct vma, and to find recently forked instances.
+         * Use zalloc to avoid weirdness if any other fields are involved.
+         */
+        vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
+        if (!vma) {
+                spin_lock(&ksm_fallback_vma_lock);
+                vma = &ksm_fallback_vma;
+        }
+
+        hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+                if (memcg && !mm_match_cgroup(rmap_item->mm, memcg))
+                        continue;
+
+                vma->vm_mm = rmap_item->mm;
+                vma->vm_start = rmap_item->address;
+                vma->vm_end = vma->vm_start + PAGE_SIZE;
+
+                referenced += page_referenced_one(page, vma,
+                                rmap_item->address, &mapcount, vm_flags);
+                if (!mapcount)
+                        goto out;
+        }
+out:
+        if (vma == &ksm_fallback_vma)
+                spin_unlock(&ksm_fallback_vma_lock);
+        else
+                kmem_cache_free(vm_area_cachep, vma);
+        return referenced;
+}
+
+int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
+{
+        struct stable_node *stable_node;
+        struct hlist_node *hlist;
+        struct rmap_item *rmap_item;
+        int ret = SWAP_AGAIN;
+        struct vm_area_struct *vma;
+
+        VM_BUG_ON(!PageKsm(page));
+        VM_BUG_ON(!PageLocked(page));
+
+        stable_node = page_stable_node(page);
+        if (!stable_node)
+                return SWAP_FAIL;
+
+        /*
+         * Temporary hack: really we need anon_vma in rmap_item, to
+         * provide the correct vma, and to find recently forked instances.
+         * Use zalloc to avoid weirdness if any other fields are involved.
+         */
+        if (TTU_ACTION(flags) != TTU_UNMAP)
+                return SWAP_FAIL;
+
+        vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC);
+        if (!vma) {
+                spin_lock(&ksm_fallback_vma_lock);
+                vma = &ksm_fallback_vma;
+        }
+
+        hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+                vma->vm_mm = rmap_item->mm;
+                vma->vm_start = rmap_item->address;
+                vma->vm_end = vma->vm_start + PAGE_SIZE;
+
+                ret = try_to_unmap_one(page, vma, rmap_item->address, flags);
+                if (ret != SWAP_AGAIN || !page_mapped(page))
+                        goto out;
+        }
+out:
+        if (vma == &ksm_fallback_vma)
+                spin_unlock(&ksm_fallback_vma_lock);
+        else
+                kmem_cache_free(vm_area_cachep, vma);
+        return ret;
+}
+
 #ifdef CONFIG_SYSFS
 /*
  * This all compiles without CONFIG_SYSFS, but is a waste of space.