Diffstat (limited to 'mm/ksm.c')
-rw-r--r--	mm/ksm.c	126
1 file changed, 98 insertions, 28 deletions
@@ -34,6 +34,8 @@
 #include <linux/swap.h>
 #include <linux/ksm.h>
 #include <linux/hash.h>
+#include <linux/freezer.h>
+#include <linux/oom.h>
 
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -300,20 +302,6 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
 	return rmap_item->address & STABLE_FLAG;
 }
 
-static void hold_anon_vma(struct rmap_item *rmap_item,
-			  struct anon_vma *anon_vma)
-{
-	rmap_item->anon_vma = anon_vma;
-	get_anon_vma(anon_vma);
-}
-
-static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
-{
-	struct anon_vma *anon_vma = rmap_item->anon_vma;
-
-	drop_anon_vma(anon_vma);
-}
-
 /*
  * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
  * page tables after it has passed through ksm_exit() - which, if necessary,
@@ -396,7 +384,7 @@ static void break_cow(struct rmap_item *rmap_item)
 	 * It is not an accident that whenever we want to break COW
 	 * to undo, we also need to drop a reference to the anon_vma.
 	 */
-	ksm_drop_anon_vma(rmap_item);
+	put_anon_vma(rmap_item->anon_vma);
 
 	down_read(&mm->mmap_sem);
 	if (ksm_test_exit(mm))
@@ -411,6 +399,20 @@ out:
 	up_read(&mm->mmap_sem);
 }
 
+static struct page *page_trans_compound_anon(struct page *page)
+{
+	if (PageTransCompound(page)) {
+		struct page *head = compound_trans_head(page);
+		/*
+		 * head may actually be splitted and freed from under
+		 * us but it's ok here.
+		 */
+		if (PageAnon(head))
+			return head;
+	}
+	return NULL;
+}
+
 static struct page *get_mergeable_page(struct rmap_item *rmap_item)
 {
 	struct mm_struct *mm = rmap_item->mm;
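A note on the helper above: compound_trans_head() reaches a THP's head page without taking any pin, so the head it returns can be split or freed at any moment; the return value is only a hint. The callers this patch adds either already hold a FOLL_GET reference from follow_page() (the scan paths) or re-take a reference and recheck before trusting it. A minimal sketch of that revalidation idiom (do_something() is a hypothetical stand-in, not from the patch):

	struct page *head = page_trans_compound_anon(page);
	if (head && get_page_unless_zero(head)) {
		if (PageAnon(head))		/* still an anon THP head? */
			do_something(head);	/* hypothetical: safe to use here */
		put_page(head);
	}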
@@ -430,7 +432,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
 	page = follow_page(vma, addr, FOLL_GET);
 	if (IS_ERR_OR_NULL(page))
 		goto out;
-	if (PageAnon(page)) {
+	if (PageAnon(page) || page_trans_compound_anon(page)) {
 		flush_anon_page(vma, page, addr);
 		flush_dcache_page(page);
 	} else {
@@ -451,7 +453,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
 			ksm_pages_sharing--;
 		else
 			ksm_pages_shared--;
-		ksm_drop_anon_vma(rmap_item);
+		put_anon_vma(rmap_item->anon_vma);
 		rmap_item->address &= PAGE_MASK;
 		cond_resched();
 	}
@@ -539,7 +541,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 		else
 			ksm_pages_shared--;
 
-		ksm_drop_anon_vma(rmap_item);
+		put_anon_vma(rmap_item->anon_vma);
 		rmap_item->address &= PAGE_MASK;
 
 	} else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -708,6 +710,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 	if (addr == -EFAULT)
 		goto out;
 
+	BUG_ON(PageTransCompound(page));
 	ptep = page_check_address(page, mm, addr, &ptl, 0);
 	if (!ptep)
 		goto out;
@@ -718,7 +721,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 	swapped = PageSwapCache(page);
 	flush_cache_page(vma, addr, page_to_pfn(page));
 	/*
-	 * Ok this is tricky, when get_user_pages_fast() run it doesnt
+	 * Ok this is tricky, when get_user_pages_fast() run it doesn't
 	 * take any lock, therefore the check that we are going to make
 	 * with the pagecount against the mapcount is racey and
 	 * O_DIRECT can happen right after the check.
@@ -783,6 +786,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 		goto out;
 
 	pmd = pmd_offset(pud, addr);
+	BUG_ON(pmd_trans_huge(*pmd));
 	if (!pmd_present(*pmd))
 		goto out;
 
@@ -800,6 +804,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
 
 	page_remove_rmap(page);
+	if (!page_mapped(page))
+		try_to_free_swap(page);
 	put_page(page);
 
 	pte_unmap_unlock(ptep, ptl);
@@ -808,6 +814,33 @@ out:
 	return err;
 }
 
+static int page_trans_compound_anon_split(struct page *page)
+{
+	int ret = 0;
+	struct page *transhuge_head = page_trans_compound_anon(page);
+	if (transhuge_head) {
+		/* Get the reference on the head to split it. */
+		if (get_page_unless_zero(transhuge_head)) {
+			/*
+			 * Recheck we got the reference while the head
+			 * was still anonymous.
+			 */
+			if (PageAnon(transhuge_head))
+				ret = split_huge_page(transhuge_head);
+			else
+				/*
+				 * Retry later if split_huge_page run
+				 * from under us.
+				 */
+				ret = 1;
+			put_page(transhuge_head);
+		} else
+			/* Retry later if split_huge_page run from under us. */
+			ret = 1;
+	}
+	return ret;
+}
+
 /*
  * try_to_merge_one_page - take two pages and merge them into one
  * @vma: the vma that holds the pte pointing to page
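On the return convention: split_huge_page() returns 0 on success, so a nonzero return from page_trans_compound_anon_split() folds every failure mode (head freed under us, head no longer anonymous, or the split itself failing) into a single "retry this page later" signal. That is what lets the caller, wired up in the next hunk, assert that a zero return leaves only base pages behind; roughly:

	/* sketch of the caller contract, mirroring the next hunk */
	if (PageTransCompound(page) && page_trans_compound_anon_split(page))
		goto out;			/* raced with a split or free: retry later */
	BUG_ON(PageTransCompound(page));	/* success means only base pages remain */

The BUG_ONs added earlier to write_protect_page() and replace_page() document the same invariant one level down.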
@@ -828,6 +861,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
 
 	if (!(vma->vm_flags & VM_MERGEABLE))
 		goto out;
+	if (PageTransCompound(page) && page_trans_compound_anon_split(page))
+		goto out;
+	BUG_ON(PageTransCompound(page));
 	if (!PageAnon(page))
 		goto out;
 
@@ -900,7 +936,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
 		goto out;
 
 	/* Must get reference to anon_vma while still holding mmap_sem */
-	hold_anon_vma(rmap_item, vma->anon_vma);
+	rmap_item->anon_vma = vma->anon_vma;
+	get_anon_vma(vma->anon_vma);
 out:
 	up_read(&mm->mmap_sem);
 	return err;
@@ -1247,12 +1284,30 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 
 	slot = ksm_scan.mm_slot;
 	if (slot == &ksm_mm_head) {
+		/*
+		 * A number of pages can hang around indefinitely on per-cpu
+		 * pagevecs, raised page count preventing write_protect_page
+		 * from merging them. Though it doesn't really matter much,
+		 * it is puzzling to see some stuck in pages_volatile until
+		 * other activity jostles them out, and they also prevented
+		 * LTP's KSM test from succeeding deterministically; so drain
+		 * them here (here rather than on entry to ksm_do_scan(),
+		 * so we don't IPI too often when pages_to_scan is set low).
+		 */
+		lru_add_drain_all();
+
 		root_unstable_tree = RB_ROOT;
 
 		spin_lock(&ksm_mmlist_lock);
 		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
 		ksm_scan.mm_slot = slot;
 		spin_unlock(&ksm_mmlist_lock);
+		/*
+		 * Although we tested list_empty() above, a racing __ksm_exit
+		 * of the last mm on the list may have removed it since then.
+		 */
+		if (slot == &ksm_mm_head)
+			return NULL;
 next_mm:
 		ksm_scan.address = 0;
 		ksm_scan.rmap_list = &slot->rmap_list;
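Why the drain helps: a page parked on a per-cpu pagevec carries an extra reference, and write_protect_page() refuses to merge any page whose reference count it cannot fully account for. Paraphrasing the existing check in write_protect_page():

	if (page_mapcount(page) + 1 + swapped != page_count(page))
		goto out_unlock;	/* unaccounted reference (e.g. a pagevec): skip */

lru_add_drain_all() flushes those pagevecs on every CPU, which is also why the patch calls it once per full scan cycle rather than on every ksm_do_scan() batch, as the comment's IPI remark explains.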
@@ -1277,7 +1332,13 @@ next_mm:
 			if (ksm_test_exit(mm))
 				break;
 			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
-			if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) {
+			if (IS_ERR_OR_NULL(*page)) {
+				ksm_scan.address += PAGE_SIZE;
+				cond_resched();
+				continue;
+			}
+			if (PageAnon(*page) ||
+			    page_trans_compound_anon(*page)) {
 				flush_anon_page(vma, *page, ksm_scan.address);
 				flush_dcache_page(*page);
 				rmap_item = get_next_rmap_item(slot,
@@ -1291,8 +1352,7 @@ next_mm:
 				up_read(&mm->mmap_sem);
 				return rmap_item;
 			}
-			if (!IS_ERR_OR_NULL(*page))
-				put_page(*page);
+			put_page(*page);
 			ksm_scan.address += PAGE_SIZE;
 			cond_resched();
 		}
@@ -1352,7 +1412,7 @@ static void ksm_do_scan(unsigned int scan_npages)
 	struct rmap_item *rmap_item;
 	struct page *uninitialized_var(page);
 
-	while (scan_npages--) {
+	while (scan_npages-- && likely(!freezing(current))) {
 		cond_resched();
 		rmap_item = scan_get_next_rmap_item(&page);
 		if (!rmap_item)
@@ -1370,6 +1430,7 @@ static int ksmd_should_run(void)
 
 static int ksm_scan_thread(void *nothing)
 {
+	set_freezable();
 	set_user_nice(current, 5);
 
 	while (!kthread_should_stop()) {
@@ -1378,11 +1439,13 @@ static int ksm_scan_thread(void *nothing)
 			ksm_do_scan(ksm_thread_pages_to_scan);
 		mutex_unlock(&ksm_thread_mutex);
 
+		try_to_freeze();
+
 		if (ksmd_should_run()) {
 			schedule_timeout_interruptible(
 				msecs_to_jiffies(ksm_thread_sleep_millisecs));
 		} else {
-			wait_event_interruptible(ksm_thread_wait,
+			wait_event_freezable(ksm_thread_wait,
 				ksmd_should_run() || kthread_should_stop());
 		}
 	}
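The freezer changes above follow the standard recipe for making a kthread freezable for suspend/hibernate: kthreads are ignored by the freezer until they call set_freezable(). A minimal self-contained sketch of the same pattern, with hypothetical work functions:

	#include <linux/freezer.h>
	#include <linux/kthread.h>
	#include <linux/wait.h>

	static DECLARE_WAIT_QUEUE_HEAD(example_wait);	/* hypothetical */
	static bool have_work(void);			/* hypothetical */
	static void do_one_batch(void);			/* hypothetical; bounded, like ksm_do_scan() */

	static int example_thread(void *data)
	{
		set_freezable();	/* opt in: kthreads are unfreezable by default */
		while (!kthread_should_stop()) {
			do_one_batch();		/* should bail early if freezing(current) */
			try_to_freeze();	/* park here if a freeze began mid-batch */
			wait_event_freezable(example_wait,
					     have_work() || kthread_should_stop());
		}
		return 0;
	}

wait_event_freezable() replaces wait_event_interruptible() so a sleeping ksmd cannot block the freezer, while the freezing(current) test in ksm_do_scan() bounds how long a running scan can delay it.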
@@ -1724,8 +1787,13 @@ static int ksm_memory_callback(struct notifier_block *self,
 		/*
 		 * Keep it very simple for now: just lock out ksmd and
 		 * MADV_UNMERGEABLE while any memory is going offline.
+		 * mutex_lock_nested() is necessary because lockdep was alarmed
+		 * that here we take ksm_thread_mutex inside notifier chain
+		 * mutex, and later take notifier chain mutex inside
+		 * ksm_thread_mutex to unlock it. But that's safe because both
+		 * are inside mem_hotplug_mutex.
 		 */
-		mutex_lock(&ksm_thread_mutex);
+		mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING);
 		break;
 
 	case MEM_OFFLINE:
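On the annotation itself: lockdep validates ordering per lock class, so taking ksm_thread_mutex under the notifier chain's mutex here, and the chain's mutex under ksm_thread_mutex on the MEM_OFFLINE path, looks like an ABBA inversion even though the outer mem_hotplug_mutex serializes both paths. mutex_lock_nested() puts this acquisition in a separate lockdep subclass so the two orders are tracked independently; with lockdep disabled it compiles down to plain mutex_lock(). A minimal sketch, where outer and inner are hypothetical stand-ins:

	static DEFINE_MUTEX(outer);	/* stand-in for mem_hotplug_mutex */
	static DEFINE_MUTEX(inner);	/* stand-in for ksm_thread_mutex */

	mutex_lock(&outer);					/* serializes both lock orders */
	mutex_lock_nested(&inner, SINGLE_DEPTH_NESTING);	/* subclass 1: tracked separately */
	mutex_unlock(&inner);
	mutex_unlock(&outer);

Note the annotation only tells lockdep not to report; the ordering is actually safe only because both paths run under the outer mutex.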
@@ -1833,9 +1901,11 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
 	if (ksm_run != flags) {
 		ksm_run = flags;
 		if (flags & KSM_RUN_UNMERGE) {
-			current->flags |= PF_OOM_ORIGIN;
+			int oom_score_adj;
+
+			oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
 			err = unmerge_and_remove_all_rmap_items();
-			current->flags &= ~PF_OOM_ORIGIN;
+			test_set_oom_score_adj(oom_score_adj);
 			if (err) {
 				ksm_run = KSM_RUN_STOP;
 				count = err;
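This last hunk tracks an API change elsewhere in the tree: the PF_OOM_ORIGIN task flag, which marked a task as the OOM killer's first choice, was retired in favour of temporarily raising the task's oom_score_adj. test_set_oom_score_adj() sets current's value and returns the previous one, giving the save/restore idiom used above:

	int old_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);	/* become the preferred OOM victim */
	err = unmerge_and_remove_all_rmap_items();	/* unmerging can re-instantiate many pages */
	test_set_oom_score_adj(old_adj);		/* restore the caller's setting */

The intent is unchanged from PF_OOM_ORIGIN: if unmerging drives the system out of memory, the process that requested it should be the one killed.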