Diffstat (limited to 'mm/ksm.c')
-rw-r--r--  mm/ksm.c  657
1 file changed, 529 insertions, 128 deletions
@@ -33,13 +33,22 @@
33 | #include <linux/mmu_notifier.h> | 33 | #include <linux/mmu_notifier.h> |
34 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
35 | #include <linux/ksm.h> | 35 | #include <linux/ksm.h> |
36 | #include <linux/hash.h> | 36 | #include <linux/hashtable.h> |
37 | #include <linux/freezer.h> | 37 | #include <linux/freezer.h> |
38 | #include <linux/oom.h> | 38 | #include <linux/oom.h> |
39 | #include <linux/numa.h> | ||
39 | 40 | ||
40 | #include <asm/tlbflush.h> | 41 | #include <asm/tlbflush.h> |
41 | #include "internal.h" | 42 | #include "internal.h" |
42 | 43 | ||
44 | #ifdef CONFIG_NUMA | ||
45 | #define NUMA(x) (x) | ||
46 | #define DO_NUMA(x) do { (x); } while (0) | ||
47 | #else | ||
48 | #define NUMA(x) (0) | ||
49 | #define DO_NUMA(x) do { } while (0) | ||
50 | #endif | ||
51 | |||
43 | /* | 52 | /* |
44 | * A few notes about the KSM scanning process, | 53 | * A few notes about the KSM scanning process, |
45 | * to make it easier to understand the data structures below: | 54 | * to make it easier to understand the data structures below: |
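Note: the NUMA()/DO_NUMA() wrappers added above let the rest of the file reference a per-node id unconditionally, while compiling to a constant 0 and a no-op on !CONFIG_NUMA builds. A minimal standalone C sketch of the same pattern (the struct and the CONFIG_NUMA define here are stand-ins, not kernel code):

#include <stdio.h>

/* Pretend this comes from the build configuration. */
#define CONFIG_NUMA 1

#ifdef CONFIG_NUMA
#define NUMA(x)    (x)
#define DO_NUMA(x) do { (x); } while (0)
#else
#define NUMA(x)    (0)
#define DO_NUMA(x) do { } while (0)
#endif

struct item { int nid; };

int main(void)
{
        struct item it = { -1 };

        DO_NUMA(it.nid = 2);            /* whole assignment vanishes on !CONFIG_NUMA */
        printf("tree index: %d\n", NUMA(it.nid));   /* evaluates to 0 on !CONFIG_NUMA */
        return 0;
}

With the define removed, the assignment disappears and the index is always 0, which is how the single-tree build stays effectively unchanged.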
@@ -78,6 +87,9 @@
78 | * take 10 attempts to find a page in the unstable tree, once it is found, | 87 | * take 10 attempts to find a page in the unstable tree, once it is found, |
79 | * it is secured in the stable tree. (When we scan a new page, we first | 88 | * it is secured in the stable tree. (When we scan a new page, we first |
80 | * compare it against the stable tree, and then against the unstable tree.) | 89 | * compare it against the stable tree, and then against the unstable tree.) |
90 | * | ||
91 | * If the merge_across_nodes tunable is unset, then KSM maintains multiple | ||
92 | * stable trees and multiple unstable trees: one of each for each NUMA node. | ||
81 | */ | 93 | */ |
82 | 94 | ||
83 | /** | 95 | /** |
@@ -113,19 +125,32 @@ struct ksm_scan {
113 | /** | 125 | /** |
114 | * struct stable_node - node of the stable rbtree | 126 | * struct stable_node - node of the stable rbtree |
115 | * @node: rb node of this ksm page in the stable tree | 127 | * @node: rb node of this ksm page in the stable tree |
128 | * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list | ||
129 | * @list: linked into migrate_nodes, pending placement in the proper node tree | ||
116 | * @hlist: hlist head of rmap_items using this ksm page | 130 | * @hlist: hlist head of rmap_items using this ksm page |
117 | * @kpfn: page frame number of this ksm page | 131 | * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) |
132 | * @nid: NUMA node id of stable tree in which linked (may not match kpfn) | ||
118 | */ | 133 | */ |
119 | struct stable_node { | 134 | struct stable_node { |
120 | struct rb_node node; | 135 | union { |
136 | struct rb_node node; /* when node of stable tree */ | ||
137 | struct { /* when listed for migration */ | ||
138 | struct list_head *head; | ||
139 | struct list_head list; | ||
140 | }; | ||
141 | }; | ||
121 | struct hlist_head hlist; | 142 | struct hlist_head hlist; |
122 | unsigned long kpfn; | 143 | unsigned long kpfn; |
144 | #ifdef CONFIG_NUMA | ||
145 | int nid; | ||
146 | #endif | ||
123 | }; | 147 | }; |
124 | 148 | ||
125 | /** | 149 | /** |
126 | * struct rmap_item - reverse mapping item for virtual addresses | 150 | * struct rmap_item - reverse mapping item for virtual addresses |
127 | * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list | 151 | * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list |
128 | * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree | 152 | * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree |
153 | * @nid: NUMA node id of unstable tree in which linked (may not match page) | ||
129 | * @mm: the memory structure this rmap_item is pointing into | 154 | * @mm: the memory structure this rmap_item is pointing into |
130 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) | 155 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) |
131 | * @oldchecksum: previous checksum of the page at that virtual address | 156 | * @oldchecksum: previous checksum of the page at that virtual address |
@@ -135,7 +160,12 @@ struct stable_node {
135 | */ | 160 | */ |
136 | struct rmap_item { | 161 | struct rmap_item { |
137 | struct rmap_item *rmap_list; | 162 | struct rmap_item *rmap_list; |
138 | struct anon_vma *anon_vma; /* when stable */ | 163 | union { |
164 | struct anon_vma *anon_vma; /* when stable */ | ||
165 | #ifdef CONFIG_NUMA | ||
166 | int nid; /* when node of unstable tree */ | ||
167 | #endif | ||
168 | }; | ||
139 | struct mm_struct *mm; | 169 | struct mm_struct *mm; |
140 | unsigned long address; /* + low bits used for flags below */ | 170 | unsigned long address; /* + low bits used for flags below */ |
141 | unsigned int oldchecksum; /* when unstable */ | 171 | unsigned int oldchecksum; /* when unstable */ |
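Note: both structures above overlay mutually exclusive state in a union: a stable_node is either linked into an rb-tree or parked on the migrate_nodes list, and an rmap_item holds either the stable anon_vma pointer or the unstable tree's nid, so neither structure grows. A standalone sketch of that overlay idea, using made-up stand-ins for the kernel types (compile with -std=c11 for the anonymous members):

#include <stdio.h>
#include <stddef.h>

struct fake_rb_node { void *left, *right, *parent; };
struct fake_list_head { struct fake_list_head *next, *prev; };

/* Illustrative overlay, mirroring the shape of the stable_node union above. */
struct node_demo {
        union {
                struct fake_rb_node rb;          /* when linked in a tree */
                struct {                         /* when queued for migration */
                        struct fake_list_head *head;
                        struct fake_list_head list;
                };
        };
        unsigned long kpfn;
};

int main(void)
{
        /* The union is only as large as its largest member. */
        printf("union occupies:  %zu bytes\n", offsetof(struct node_demo, kpfn));
        printf("rb view alone:   %zu bytes\n", sizeof(struct fake_rb_node));
        printf("whole node_demo: %zu bytes\n", sizeof(struct node_demo));
        return 0;
}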
@@ -153,12 +183,16 @@ struct rmap_item {
153 | #define STABLE_FLAG 0x200 /* is listed from the stable tree */ | 183 | #define STABLE_FLAG 0x200 /* is listed from the stable tree */ |
154 | 184 | ||
155 | /* The stable and unstable tree heads */ | 185 | /* The stable and unstable tree heads */ |
156 | static struct rb_root root_stable_tree = RB_ROOT; | 186 | static struct rb_root one_stable_tree[1] = { RB_ROOT }; |
157 | static struct rb_root root_unstable_tree = RB_ROOT; | 187 | static struct rb_root one_unstable_tree[1] = { RB_ROOT }; |
188 | static struct rb_root *root_stable_tree = one_stable_tree; | ||
189 | static struct rb_root *root_unstable_tree = one_unstable_tree; | ||
158 | 190 | ||
159 | #define MM_SLOTS_HASH_SHIFT 10 | 191 | /* Recently migrated nodes of stable tree, pending proper placement */ |
160 | #define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT) | 192 | static LIST_HEAD(migrate_nodes); |
161 | static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS]; | 193 | |
194 | #define MM_SLOTS_HASH_BITS 10 | ||
195 | static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); | ||
162 | 196 | ||
163 | static struct mm_slot ksm_mm_head = { | 197 | static struct mm_slot ksm_mm_head = { |
164 | .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), | 198 | .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), |
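Note: root_stable_tree and root_unstable_tree now start out pointing at static one-element arrays, so nothing changes for !CONFIG_NUMA or for merge_across_nodes=1; per-node roots are only allocated when the sysfs knob is first cleared (see merge_across_nodes_store near the end of this diff). A small userspace sketch of that "pointer defaults to a built-in single root" idiom, with invented names:

#include <stdio.h>
#include <stdlib.h>

struct root { void *rb_node; };          /* stand-in for struct rb_root */

static struct root one_tree[1];          /* the built-in, single root   */
static struct root *roots = one_tree;    /* starts at the static root   */
static int nr_roots = 1;

/* Hypothetical switch to per-node roots, mirroring the idea only. */
static int use_per_node_roots(int nr_node_ids)
{
        struct root *buf = calloc(nr_node_ids, sizeof(*buf));

        if (!buf)
                return -1;
        roots = buf;
        nr_roots = nr_node_ids;
        return 0;
}

int main(void)
{
        printf("roots before: %p (static)\n", (void *)roots);
        if (use_per_node_roots(4) == 0)
                printf("roots after:  %p (%d allocated)\n", (void *)roots, nr_roots);
        return 0;
}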
@@ -189,10 +223,21 @@ static unsigned int ksm_thread_pages_to_scan = 100;
189 | /* Milliseconds ksmd should sleep between batches */ | 223 | /* Milliseconds ksmd should sleep between batches */ |
190 | static unsigned int ksm_thread_sleep_millisecs = 20; | 224 | static unsigned int ksm_thread_sleep_millisecs = 20; |
191 | 225 | ||
226 | #ifdef CONFIG_NUMA | ||
227 | /* Zeroed when merging across nodes is not allowed */ | ||
228 | static unsigned int ksm_merge_across_nodes = 1; | ||
229 | static int ksm_nr_node_ids = 1; | ||
230 | #else | ||
231 | #define ksm_merge_across_nodes 1U | ||
232 | #define ksm_nr_node_ids 1 | ||
233 | #endif | ||
234 | |||
192 | #define KSM_RUN_STOP 0 | 235 | #define KSM_RUN_STOP 0 |
193 | #define KSM_RUN_MERGE 1 | 236 | #define KSM_RUN_MERGE 1 |
194 | #define KSM_RUN_UNMERGE 2 | 237 | #define KSM_RUN_UNMERGE 2 |
195 | static unsigned int ksm_run = KSM_RUN_STOP; | 238 | #define KSM_RUN_OFFLINE 4 |
239 | static unsigned long ksm_run = KSM_RUN_STOP; | ||
240 | static void wait_while_offlining(void); | ||
196 | 241 | ||
197 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); | 242 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); |
198 | static DEFINE_MUTEX(ksm_thread_mutex); | 243 | static DEFINE_MUTEX(ksm_thread_mutex); |
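Note: ksm_run is widened to unsigned long and KSM_RUN_OFFLINE is a separate bit OR'd on top of the stop/merge/unmerge value, which is why run_show later prints it with %lu and why the hot-remove code can sleep on that one bit. A trivial sketch of mixing a small state value with an independent flag bit (the names mirror the constants above, but this is not kernel code):

#include <stdio.h>

#define RUN_STOP    0UL
#define RUN_MERGE   1UL
#define RUN_UNMERGE 2UL
#define RUN_OFFLINE 4UL   /* independent bit, OR'd over the value above */

int main(void)
{
        unsigned long run = RUN_MERGE;

        run |= RUN_OFFLINE;                        /* hot-remove in progress */
        printf("merging requested: %s\n", (run & RUN_MERGE) ? "yes" : "no");
        printf("offline bit set:   %s\n", (run & RUN_OFFLINE) ? "yes" : "no");
        run &= ~RUN_OFFLINE;                       /* hot-remove finished    */
        printf("offline bit set:   %s\n", (run & RUN_OFFLINE) ? "yes" : "no");
        return 0;
}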
@@ -275,31 +320,21 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
275 | 320 | ||
276 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) | 321 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) |
277 | { | 322 | { |
278 | struct mm_slot *mm_slot; | ||
279 | struct hlist_head *bucket; | ||
280 | struct hlist_node *node; | 323 | struct hlist_node *node; |
324 | struct mm_slot *slot; | ||
325 | |||
326 | hash_for_each_possible(mm_slots_hash, slot, node, link, (unsigned long)mm) | ||
327 | if (slot->mm == mm) | ||
328 | return slot; | ||
281 | 329 | ||
282 | bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; | ||
283 | hlist_for_each_entry(mm_slot, node, bucket, link) { | ||
284 | if (mm == mm_slot->mm) | ||
285 | return mm_slot; | ||
286 | } | ||
287 | return NULL; | 330 | return NULL; |
288 | } | 331 | } |
289 | 332 | ||
290 | static void insert_to_mm_slots_hash(struct mm_struct *mm, | 333 | static void insert_to_mm_slots_hash(struct mm_struct *mm, |
291 | struct mm_slot *mm_slot) | 334 | struct mm_slot *mm_slot) |
292 | { | 335 | { |
293 | struct hlist_head *bucket; | ||
294 | |||
295 | bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; | ||
296 | mm_slot->mm = mm; | 336 | mm_slot->mm = mm; |
297 | hlist_add_head(&mm_slot->link, bucket); | 337 | hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); |
298 | } | ||
299 | |||
300 | static inline int in_stable_tree(struct rmap_item *rmap_item) | ||
301 | { | ||
302 | return rmap_item->address & STABLE_FLAG; | ||
303 | } | 338 | } |
304 | 339 | ||
305 | /* | 340 | /* |
@@ -333,7 +368,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
333 | 368 | ||
334 | do { | 369 | do { |
335 | cond_resched(); | 370 | cond_resched(); |
336 | page = follow_page(vma, addr, FOLL_GET); | 371 | page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION); |
337 | if (IS_ERR_OR_NULL(page)) | 372 | if (IS_ERR_OR_NULL(page)) |
338 | break; | 373 | break; |
339 | if (PageKsm(page)) | 374 | if (PageKsm(page)) |
@@ -447,6 +482,17 @@ out: page = NULL;
447 | return page; | 482 | return page; |
448 | } | 483 | } |
449 | 484 | ||
485 | /* | ||
486 | * This helper is used for getting right index into array of tree roots. | ||
487 | * When merge_across_nodes knob is set to 1, there are only two rb-trees for | ||
488 | * stable and unstable pages from all nodes with roots in index 0. Otherwise, | ||
489 | * every node has its own stable and unstable tree. | ||
490 | */ | ||
491 | static inline int get_kpfn_nid(unsigned long kpfn) | ||
492 | { | ||
493 | return ksm_merge_across_nodes ? 0 : pfn_to_nid(kpfn); | ||
494 | } | ||
495 | |||
450 | static void remove_node_from_stable_tree(struct stable_node *stable_node) | 496 | static void remove_node_from_stable_tree(struct stable_node *stable_node) |
451 | { | 497 | { |
452 | struct rmap_item *rmap_item; | 498 | struct rmap_item *rmap_item; |
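Note: get_kpfn_nid() is what lets the rest of the code index root_stable_tree/root_unstable_tree the same way in both modes: when merging across nodes everything lands in slot 0, otherwise the page's node id picks the tree. A tiny standalone illustration with a fake pfn-to-node mapping (the shift is invented, only the selection logic matters):

#include <stdio.h>

static unsigned int merge_across_nodes = 0;   /* sysfs knob stand-in */

static int pfn_to_nid_stub(unsigned long pfn)
{
        return (int)(pfn >> 20);              /* fake node layout */
}

static int tree_index(unsigned long kpfn)
{
        return merge_across_nodes ? 0 : pfn_to_nid_stub(kpfn);
}

int main(void)
{
        printf("index for pfn 0x%lx: %d\n", 0x300000UL, tree_index(0x300000UL));
        merge_across_nodes = 1;
        printf("index for pfn 0x%lx: %d\n", 0x300000UL, tree_index(0x300000UL));
        return 0;
}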
@@ -462,7 +508,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
462 | cond_resched(); | 508 | cond_resched(); |
463 | } | 509 | } |
464 | 510 | ||
465 | rb_erase(&stable_node->node, &root_stable_tree); | 511 | if (stable_node->head == &migrate_nodes) |
512 | list_del(&stable_node->list); | ||
513 | else | ||
514 | rb_erase(&stable_node->node, | ||
515 | root_stable_tree + NUMA(stable_node->nid)); | ||
466 | free_stable_node(stable_node); | 516 | free_stable_node(stable_node); |
467 | } | 517 | } |
468 | 518 | ||
@@ -472,6 +522,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
472 | * In which case we can trust the content of the page, and it | 522 | * In which case we can trust the content of the page, and it |
473 | * returns the gotten page; but if the page has now been zapped, | 523 | * returns the gotten page; but if the page has now been zapped, |
474 | * remove the stale node from the stable tree and return NULL. | 524 | * remove the stale node from the stable tree and return NULL. |
525 | * But beware, the stable node's page might be being migrated. | ||
475 | * | 526 | * |
476 | * You would expect the stable_node to hold a reference to the ksm page. | 527 | * You would expect the stable_node to hold a reference to the ksm page. |
477 | * But if it increments the page's count, swapping out has to wait for | 528 | * But if it increments the page's count, swapping out has to wait for |
@@ -482,40 +533,77 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
482 | * pointing back to this stable node. This relies on freeing a PageAnon | 533 | * pointing back to this stable node. This relies on freeing a PageAnon |
483 | * page to reset its page->mapping to NULL, and relies on no other use of | 534 | * page to reset its page->mapping to NULL, and relies on no other use of |
484 | * a page to put something that might look like our key in page->mapping. | 535 | * a page to put something that might look like our key in page->mapping. |
485 | * | ||
486 | * include/linux/pagemap.h page_cache_get_speculative() is a good reference, | ||
487 | * but this is different - made simpler by ksm_thread_mutex being held, but | ||
488 | * interesting for assuming that no other use of the struct page could ever | ||
489 | * put our expected_mapping into page->mapping (or a field of the union which | ||
490 | * coincides with page->mapping). The RCU calls are not for KSM at all, but | ||
491 | * to keep the page_count protocol described with page_cache_get_speculative. | ||
492 | * | ||
493 | * Note: it is possible that get_ksm_page() will return NULL one moment, | ||
494 | * then page the next, if the page is in between page_freeze_refs() and | ||
495 | * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page | ||
496 | * is on its way to being freed; but it is an anomaly to bear in mind. | 536 | * is on its way to being freed; but it is an anomaly to bear in mind. |
497 | */ | 537 | */ |
498 | static struct page *get_ksm_page(struct stable_node *stable_node) | 538 | static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) |
499 | { | 539 | { |
500 | struct page *page; | 540 | struct page *page; |
501 | void *expected_mapping; | 541 | void *expected_mapping; |
542 | unsigned long kpfn; | ||
502 | 543 | ||
503 | page = pfn_to_page(stable_node->kpfn); | ||
504 | expected_mapping = (void *)stable_node + | 544 | expected_mapping = (void *)stable_node + |
505 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | 545 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); |
506 | rcu_read_lock(); | 546 | again: |
507 | if (page->mapping != expected_mapping) | 547 | kpfn = ACCESS_ONCE(stable_node->kpfn); |
508 | goto stale; | 548 | page = pfn_to_page(kpfn); |
509 | if (!get_page_unless_zero(page)) | 549 | |
550 | /* | ||
551 | * page is computed from kpfn, so on most architectures reading | ||
552 | * page->mapping is naturally ordered after reading node->kpfn, | ||
553 | * but on Alpha we need to be more careful. | ||
554 | */ | ||
555 | smp_read_barrier_depends(); | ||
556 | if (ACCESS_ONCE(page->mapping) != expected_mapping) | ||
510 | goto stale; | 557 | goto stale; |
511 | if (page->mapping != expected_mapping) { | 558 | |
559 | /* | ||
560 | * We cannot do anything with the page while its refcount is 0. | ||
561 | * Usually 0 means free, or tail of a higher-order page: in which | ||
562 | * case this node is no longer referenced, and should be freed; | ||
563 | * however, it might mean that the page is under page_freeze_refs(). | ||
564 | * The __remove_mapping() case is easy, again the node is now stale; | ||
565 | * but if page is swapcache in migrate_page_move_mapping(), it might | ||
566 | * still be our page, in which case it's essential to keep the node. | ||
567 | */ | ||
568 | while (!get_page_unless_zero(page)) { | ||
569 | /* | ||
570 | * Another check for page->mapping != expected_mapping would | ||
571 | * work here too. We have chosen the !PageSwapCache test to | ||
572 | * optimize the common case, when the page is or is about to | ||
573 | * be freed: PageSwapCache is cleared (under spin_lock_irq) | ||
574 | * in the freeze_refs section of __remove_mapping(); but Anon | ||
575 | * page->mapping reset to NULL later, in free_pages_prepare(). | ||
576 | */ | ||
577 | if (!PageSwapCache(page)) | ||
578 | goto stale; | ||
579 | cpu_relax(); | ||
580 | } | ||
581 | |||
582 | if (ACCESS_ONCE(page->mapping) != expected_mapping) { | ||
512 | put_page(page); | 583 | put_page(page); |
513 | goto stale; | 584 | goto stale; |
514 | } | 585 | } |
515 | rcu_read_unlock(); | 586 | |
587 | if (lock_it) { | ||
588 | lock_page(page); | ||
589 | if (ACCESS_ONCE(page->mapping) != expected_mapping) { | ||
590 | unlock_page(page); | ||
591 | put_page(page); | ||
592 | goto stale; | ||
593 | } | ||
594 | } | ||
516 | return page; | 595 | return page; |
596 | |||
517 | stale: | 597 | stale: |
518 | rcu_read_unlock(); | 598 | /* |
599 | * We come here from above when page->mapping or !PageSwapCache | ||
600 | * suggests that the node is stale; but it might be under migration. | ||
601 | * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), | ||
602 | * before checking whether node->kpfn has been changed. | ||
603 | */ | ||
604 | smp_rmb(); | ||
605 | if (ACCESS_ONCE(stable_node->kpfn) != kpfn) | ||
606 | goto again; | ||
519 | remove_node_from_stable_tree(stable_node); | 607 | remove_node_from_stable_tree(stable_node); |
520 | return NULL; | 608 | return NULL; |
521 | } | 609 | } |
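Note: the rewritten get_ksm_page() has to tolerate concurrent page migration: it samples kpfn, checks page->mapping, spins while the refcount is frozen, and if anything looks stale it issues a read barrier and re-checks whether kpfn moved before declaring the node dead. Below is a small C11/pthreads sketch of just the "publish the new location, reader re-checks and retries" shape, using release/acquire atomics in place of the kernel's smp_wmb()/smp_rmb(); it models the retry loop only, not the refcount handling, and none of it is kernel code:

#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

static int payload[2];                       /* two "locations" for the data  */
static _Atomic unsigned long where;          /* which slot is current (~kpfn) */

static void *migrate(void *arg)
{
        (void)arg;
        payload[1] = 42;                                         /* fill the new location */
        atomic_store_explicit(&where, 1, memory_order_release);  /* then publish it       */
        return NULL;
}

static int read_stable(void)
{
        unsigned long slot, again;
        int val;

        do {
                slot = atomic_load_explicit(&where, memory_order_acquire);
                val = payload[slot];          /* location may be superseded concurrently */
                again = atomic_load_explicit(&where, memory_order_acquire);
        } while (again != slot);              /* retry, like the "goto again" above */
        return val;
}

int main(void)
{
        pthread_t t;

        payload[0] = 7;
        pthread_create(&t, NULL, migrate, NULL);
        printf("read %d\n", read_stable());   /* 7 or 42, never a torn mix */
        pthread_join(t, NULL);
        return 0;
}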
@@ -531,11 +619,10 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
531 | struct page *page; | 619 | struct page *page; |
532 | 620 | ||
533 | stable_node = rmap_item->head; | 621 | stable_node = rmap_item->head; |
534 | page = get_ksm_page(stable_node); | 622 | page = get_ksm_page(stable_node, true); |
535 | if (!page) | 623 | if (!page) |
536 | goto out; | 624 | goto out; |
537 | 625 | ||
538 | lock_page(page); | ||
539 | hlist_del(&rmap_item->hlist); | 626 | hlist_del(&rmap_item->hlist); |
540 | unlock_page(page); | 627 | unlock_page(page); |
541 | put_page(page); | 628 | put_page(page); |
@@ -560,8 +647,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
560 | age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); | 647 | age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); |
561 | BUG_ON(age > 1); | 648 | BUG_ON(age > 1); |
562 | if (!age) | 649 | if (!age) |
563 | rb_erase(&rmap_item->node, &root_unstable_tree); | 650 | rb_erase(&rmap_item->node, |
564 | 651 | root_unstable_tree + NUMA(rmap_item->nid)); | |
565 | ksm_pages_unshared--; | 652 | ksm_pages_unshared--; |
566 | rmap_item->address &= PAGE_MASK; | 653 | rmap_item->address &= PAGE_MASK; |
567 | } | 654 | } |
@@ -581,7 +668,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
581 | } | 668 | } |
582 | 669 | ||
583 | /* | 670 | /* |
584 | * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather | 671 | * Though it's very tempting to unmerge rmap_items from stable tree rather |
585 | * than check every pte of a given vma, the locking doesn't quite work for | 672 | * than check every pte of a given vma, the locking doesn't quite work for |
586 | * that - an rmap_item is assigned to the stable tree after inserting ksm | 673 | * that - an rmap_item is assigned to the stable tree after inserting ksm |
587 | * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing | 674 | * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing |
@@ -614,6 +701,71 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
614 | /* | 701 | /* |
615 | * Only called through the sysfs control interface: | 702 | * Only called through the sysfs control interface: |
616 | */ | 703 | */ |
704 | static int remove_stable_node(struct stable_node *stable_node) | ||
705 | { | ||
706 | struct page *page; | ||
707 | int err; | ||
708 | |||
709 | page = get_ksm_page(stable_node, true); | ||
710 | if (!page) { | ||
711 | /* | ||
712 | * get_ksm_page did remove_node_from_stable_tree itself. | ||
713 | */ | ||
714 | return 0; | ||
715 | } | ||
716 | |||
717 | if (WARN_ON_ONCE(page_mapped(page))) { | ||
718 | /* | ||
719 | * This should not happen: but if it does, just refuse to let | ||
720 | * merge_across_nodes be switched - there is no need to panic. | ||
721 | */ | ||
722 | err = -EBUSY; | ||
723 | } else { | ||
724 | /* | ||
725 | * The stable node did not yet appear stale to get_ksm_page(), | ||
726 | * since that allows for an unmapped ksm page to be recognized | ||
727 | * right up until it is freed; but the node is safe to remove. | ||
728 | * This page might be in a pagevec waiting to be freed, | ||
729 | * or it might be PageSwapCache (perhaps under writeback), | ||
730 | * or it might have been removed from swapcache a moment ago. | ||
731 | */ | ||
732 | set_page_stable_node(page, NULL); | ||
733 | remove_node_from_stable_tree(stable_node); | ||
734 | err = 0; | ||
735 | } | ||
736 | |||
737 | unlock_page(page); | ||
738 | put_page(page); | ||
739 | return err; | ||
740 | } | ||
741 | |||
742 | static int remove_all_stable_nodes(void) | ||
743 | { | ||
744 | struct stable_node *stable_node; | ||
745 | struct list_head *this, *next; | ||
746 | int nid; | ||
747 | int err = 0; | ||
748 | |||
749 | for (nid = 0; nid < ksm_nr_node_ids; nid++) { | ||
750 | while (root_stable_tree[nid].rb_node) { | ||
751 | stable_node = rb_entry(root_stable_tree[nid].rb_node, | ||
752 | struct stable_node, node); | ||
753 | if (remove_stable_node(stable_node)) { | ||
754 | err = -EBUSY; | ||
755 | break; /* proceed to next nid */ | ||
756 | } | ||
757 | cond_resched(); | ||
758 | } | ||
759 | } | ||
760 | list_for_each_safe(this, next, &migrate_nodes) { | ||
761 | stable_node = list_entry(this, struct stable_node, list); | ||
762 | if (remove_stable_node(stable_node)) | ||
763 | err = -EBUSY; | ||
764 | cond_resched(); | ||
765 | } | ||
766 | return err; | ||
767 | } | ||
768 | |||
617 | static int unmerge_and_remove_all_rmap_items(void) | 769 | static int unmerge_and_remove_all_rmap_items(void) |
618 | { | 770 | { |
619 | struct mm_slot *mm_slot; | 771 | struct mm_slot *mm_slot; |
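Note: remove_all_stable_nodes() empties each tree by repeatedly removing whatever rb_first() returns, and walks migrate_nodes with list_for_each_safe() so entries can be deleted mid-iteration. As a reminder of why the "safe" form matters, here is a small standalone example of deleting while traversing, done with a pointer-to-pointer over a singly linked list rather than the kernel list API:

#include <stdio.h>
#include <stdlib.h>

struct snode {
        int busy;
        struct snode *next;
};

/* Remove every non-busy node; the pointer-to-pointer makes unlinking safe. */
static void prune(struct snode **head)
{
        struct snode **pp = head;

        while (*pp) {
                struct snode *cur = *pp;

                if (!cur->busy) {
                        *pp = cur->next;     /* unlink before the node goes away */
                        free(cur);
                } else {
                        pp = &cur->next;
                }
        }
}

int main(void)
{
        struct snode *head = NULL;

        for (int i = 0; i < 5; i++) {
                struct snode *n = malloc(sizeof(*n));

                n->busy = (i == 2);          /* leave one "busy" node behind */
                n->next = head;
                head = n;
        }
        prune(&head);
        printf("remaining busy nodes: %s\n", head && !head->next ? "1" : "other");
        return 0;
}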
@@ -647,7 +799,7 @@ static int unmerge_and_remove_all_rmap_items(void)
647 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, | 799 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, |
648 | struct mm_slot, mm_list); | 800 | struct mm_slot, mm_list); |
649 | if (ksm_test_exit(mm)) { | 801 | if (ksm_test_exit(mm)) { |
650 | hlist_del(&mm_slot->link); | 802 | hash_del(&mm_slot->link); |
651 | list_del(&mm_slot->mm_list); | 803 | list_del(&mm_slot->mm_list); |
652 | spin_unlock(&ksm_mmlist_lock); | 804 | spin_unlock(&ksm_mmlist_lock); |
653 | 805 | ||
@@ -661,6 +813,8 @@ static int unmerge_and_remove_all_rmap_items(void)
661 | } | 813 | } |
662 | } | 814 | } |
663 | 815 | ||
816 | /* Clean up stable nodes, but don't worry if some are still busy */ | ||
817 | remove_all_stable_nodes(); | ||
664 | ksm_scan.seqnr = 0; | 818 | ksm_scan.seqnr = 0; |
665 | return 0; | 819 | return 0; |
666 | 820 | ||
@@ -946,6 +1100,9 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
946 | if (err) | 1100 | if (err) |
947 | goto out; | 1101 | goto out; |
948 | 1102 | ||
1103 | /* Unstable nid is in union with stable anon_vma: remove first */ | ||
1104 | remove_rmap_item_from_tree(rmap_item); | ||
1105 | |||
949 | /* Must get reference to anon_vma while still holding mmap_sem */ | 1106 | /* Must get reference to anon_vma while still holding mmap_sem */ |
950 | rmap_item->anon_vma = vma->anon_vma; | 1107 | rmap_item->anon_vma = vma->anon_vma; |
951 | get_anon_vma(vma->anon_vma); | 1108 | get_anon_vma(vma->anon_vma); |
@@ -996,42 +1153,99 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
996 | */ | 1153 | */ |
997 | static struct page *stable_tree_search(struct page *page) | 1154 | static struct page *stable_tree_search(struct page *page) |
998 | { | 1155 | { |
999 | struct rb_node *node = root_stable_tree.rb_node; | 1156 | int nid; |
1157 | struct rb_root *root; | ||
1158 | struct rb_node **new; | ||
1159 | struct rb_node *parent; | ||
1000 | struct stable_node *stable_node; | 1160 | struct stable_node *stable_node; |
1161 | struct stable_node *page_node; | ||
1001 | 1162 | ||
1002 | stable_node = page_stable_node(page); | 1163 | page_node = page_stable_node(page); |
1003 | if (stable_node) { /* ksm page forked */ | 1164 | if (page_node && page_node->head != &migrate_nodes) { |
1165 | /* ksm page forked */ | ||
1004 | get_page(page); | 1166 | get_page(page); |
1005 | return page; | 1167 | return page; |
1006 | } | 1168 | } |
1007 | 1169 | ||
1008 | while (node) { | 1170 | nid = get_kpfn_nid(page_to_pfn(page)); |
1171 | root = root_stable_tree + nid; | ||
1172 | again: | ||
1173 | new = &root->rb_node; | ||
1174 | parent = NULL; | ||
1175 | |||
1176 | while (*new) { | ||
1009 | struct page *tree_page; | 1177 | struct page *tree_page; |
1010 | int ret; | 1178 | int ret; |
1011 | 1179 | ||
1012 | cond_resched(); | 1180 | cond_resched(); |
1013 | stable_node = rb_entry(node, struct stable_node, node); | 1181 | stable_node = rb_entry(*new, struct stable_node, node); |
1014 | tree_page = get_ksm_page(stable_node); | 1182 | tree_page = get_ksm_page(stable_node, false); |
1015 | if (!tree_page) | 1183 | if (!tree_page) |
1016 | return NULL; | 1184 | return NULL; |
1017 | 1185 | ||
1018 | ret = memcmp_pages(page, tree_page); | 1186 | ret = memcmp_pages(page, tree_page); |
1187 | put_page(tree_page); | ||
1019 | 1188 | ||
1020 | if (ret < 0) { | 1189 | parent = *new; |
1021 | put_page(tree_page); | 1190 | if (ret < 0) |
1022 | node = node->rb_left; | 1191 | new = &parent->rb_left; |
1023 | } else if (ret > 0) { | 1192 | else if (ret > 0) |
1024 | put_page(tree_page); | 1193 | new = &parent->rb_right; |
1025 | node = node->rb_right; | 1194 | else { |
1026 | } else | 1195 | /* |
1027 | return tree_page; | 1196 | * Lock and unlock the stable_node's page (which |
1197 | * might already have been migrated) so that page | ||
1198 | * migration is sure to notice its raised count. | ||
1199 | * It would be more elegant to return stable_node | ||
1200 | * than kpage, but that involves more changes. | ||
1201 | */ | ||
1202 | tree_page = get_ksm_page(stable_node, true); | ||
1203 | if (tree_page) { | ||
1204 | unlock_page(tree_page); | ||
1205 | if (get_kpfn_nid(stable_node->kpfn) != | ||
1206 | NUMA(stable_node->nid)) { | ||
1207 | put_page(tree_page); | ||
1208 | goto replace; | ||
1209 | } | ||
1210 | return tree_page; | ||
1211 | } | ||
1212 | /* | ||
1213 | * There is now a place for page_node, but the tree may | ||
1214 | * have been rebalanced, so re-evaluate parent and new. | ||
1215 | */ | ||
1216 | if (page_node) | ||
1217 | goto again; | ||
1218 | return NULL; | ||
1219 | } | ||
1028 | } | 1220 | } |
1029 | 1221 | ||
1030 | return NULL; | 1222 | if (!page_node) |
1223 | return NULL; | ||
1224 | |||
1225 | list_del(&page_node->list); | ||
1226 | DO_NUMA(page_node->nid = nid); | ||
1227 | rb_link_node(&page_node->node, parent, new); | ||
1228 | rb_insert_color(&page_node->node, root); | ||
1229 | get_page(page); | ||
1230 | return page; | ||
1231 | |||
1232 | replace: | ||
1233 | if (page_node) { | ||
1234 | list_del(&page_node->list); | ||
1235 | DO_NUMA(page_node->nid = nid); | ||
1236 | rb_replace_node(&stable_node->node, &page_node->node, root); | ||
1237 | get_page(page); | ||
1238 | } else { | ||
1239 | rb_erase(&stable_node->node, root); | ||
1240 | page = NULL; | ||
1241 | } | ||
1242 | stable_node->head = &migrate_nodes; | ||
1243 | list_add(&stable_node->list, stable_node->head); | ||
1244 | return page; | ||
1031 | } | 1245 | } |
1032 | 1246 | ||
1033 | /* | 1247 | /* |
1034 | * stable_tree_insert - insert rmap_item pointing to new ksm page | 1248 | * stable_tree_insert - insert stable tree node pointing to new ksm page |
1035 | * into the stable tree. | 1249 | * into the stable tree. |
1036 | * | 1250 | * |
1037 | * This function returns the stable tree node just allocated on success, | 1251 | * This function returns the stable tree node just allocated on success, |
@@ -1039,17 +1253,25 @@ static struct page *stable_tree_search(struct page *page)
1039 | */ | 1253 | */ |
1040 | static struct stable_node *stable_tree_insert(struct page *kpage) | 1254 | static struct stable_node *stable_tree_insert(struct page *kpage) |
1041 | { | 1255 | { |
1042 | struct rb_node **new = &root_stable_tree.rb_node; | 1256 | int nid; |
1257 | unsigned long kpfn; | ||
1258 | struct rb_root *root; | ||
1259 | struct rb_node **new; | ||
1043 | struct rb_node *parent = NULL; | 1260 | struct rb_node *parent = NULL; |
1044 | struct stable_node *stable_node; | 1261 | struct stable_node *stable_node; |
1045 | 1262 | ||
1263 | kpfn = page_to_pfn(kpage); | ||
1264 | nid = get_kpfn_nid(kpfn); | ||
1265 | root = root_stable_tree + nid; | ||
1266 | new = &root->rb_node; | ||
1267 | |||
1046 | while (*new) { | 1268 | while (*new) { |
1047 | struct page *tree_page; | 1269 | struct page *tree_page; |
1048 | int ret; | 1270 | int ret; |
1049 | 1271 | ||
1050 | cond_resched(); | 1272 | cond_resched(); |
1051 | stable_node = rb_entry(*new, struct stable_node, node); | 1273 | stable_node = rb_entry(*new, struct stable_node, node); |
1052 | tree_page = get_ksm_page(stable_node); | 1274 | tree_page = get_ksm_page(stable_node, false); |
1053 | if (!tree_page) | 1275 | if (!tree_page) |
1054 | return NULL; | 1276 | return NULL; |
1055 | 1277 | ||
@@ -1075,13 +1297,12 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
1075 | if (!stable_node) | 1297 | if (!stable_node) |
1076 | return NULL; | 1298 | return NULL; |
1077 | 1299 | ||
1078 | rb_link_node(&stable_node->node, parent, new); | ||
1079 | rb_insert_color(&stable_node->node, &root_stable_tree); | ||
1080 | |||
1081 | INIT_HLIST_HEAD(&stable_node->hlist); | 1300 | INIT_HLIST_HEAD(&stable_node->hlist); |
1082 | 1301 | stable_node->kpfn = kpfn; | |
1083 | stable_node->kpfn = page_to_pfn(kpage); | ||
1084 | set_page_stable_node(kpage, stable_node); | 1302 | set_page_stable_node(kpage, stable_node); |
1303 | DO_NUMA(stable_node->nid = nid); | ||
1304 | rb_link_node(&stable_node->node, parent, new); | ||
1305 | rb_insert_color(&stable_node->node, root); | ||
1085 | 1306 | ||
1086 | return stable_node; | 1307 | return stable_node; |
1087 | } | 1308 | } |
@@ -1104,10 +1325,15 @@ static
1104 | struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, | 1325 | struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, |
1105 | struct page *page, | 1326 | struct page *page, |
1106 | struct page **tree_pagep) | 1327 | struct page **tree_pagep) |
1107 | |||
1108 | { | 1328 | { |
1109 | struct rb_node **new = &root_unstable_tree.rb_node; | 1329 | struct rb_node **new; |
1330 | struct rb_root *root; | ||
1110 | struct rb_node *parent = NULL; | 1331 | struct rb_node *parent = NULL; |
1332 | int nid; | ||
1333 | |||
1334 | nid = get_kpfn_nid(page_to_pfn(page)); | ||
1335 | root = root_unstable_tree + nid; | ||
1336 | new = &root->rb_node; | ||
1111 | 1337 | ||
1112 | while (*new) { | 1338 | while (*new) { |
1113 | struct rmap_item *tree_rmap_item; | 1339 | struct rmap_item *tree_rmap_item; |
@@ -1137,6 +1363,15 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1137 | } else if (ret > 0) { | 1363 | } else if (ret > 0) { |
1138 | put_page(tree_page); | 1364 | put_page(tree_page); |
1139 | new = &parent->rb_right; | 1365 | new = &parent->rb_right; |
1366 | } else if (!ksm_merge_across_nodes && | ||
1367 | page_to_nid(tree_page) != nid) { | ||
1368 | /* | ||
1369 | * If tree_page has been migrated to another NUMA node, | ||
1370 | * it will be flushed out and put in the right unstable | ||
1371 | * tree next time: only merge with it when across_nodes. | ||
1372 | */ | ||
1373 | put_page(tree_page); | ||
1374 | return NULL; | ||
1140 | } else { | 1375 | } else { |
1141 | *tree_pagep = tree_page; | 1376 | *tree_pagep = tree_page; |
1142 | return tree_rmap_item; | 1377 | return tree_rmap_item; |
@@ -1145,8 +1380,9 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1145 | 1380 | ||
1146 | rmap_item->address |= UNSTABLE_FLAG; | 1381 | rmap_item->address |= UNSTABLE_FLAG; |
1147 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); | 1382 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); |
1383 | DO_NUMA(rmap_item->nid = nid); | ||
1148 | rb_link_node(&rmap_item->node, parent, new); | 1384 | rb_link_node(&rmap_item->node, parent, new); |
1149 | rb_insert_color(&rmap_item->node, &root_unstable_tree); | 1385 | rb_insert_color(&rmap_item->node, root); |
1150 | 1386 | ||
1151 | ksm_pages_unshared++; | 1387 | ksm_pages_unshared++; |
1152 | return NULL; | 1388 | return NULL; |
@@ -1188,10 +1424,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1188 | unsigned int checksum; | 1424 | unsigned int checksum; |
1189 | int err; | 1425 | int err; |
1190 | 1426 | ||
1191 | remove_rmap_item_from_tree(rmap_item); | 1427 | stable_node = page_stable_node(page); |
1428 | if (stable_node) { | ||
1429 | if (stable_node->head != &migrate_nodes && | ||
1430 | get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) { | ||
1431 | rb_erase(&stable_node->node, | ||
1432 | root_stable_tree + NUMA(stable_node->nid)); | ||
1433 | stable_node->head = &migrate_nodes; | ||
1434 | list_add(&stable_node->list, stable_node->head); | ||
1435 | } | ||
1436 | if (stable_node->head != &migrate_nodes && | ||
1437 | rmap_item->head == stable_node) | ||
1438 | return; | ||
1439 | } | ||
1192 | 1440 | ||
1193 | /* We first start with searching the page inside the stable tree */ | 1441 | /* We first start with searching the page inside the stable tree */ |
1194 | kpage = stable_tree_search(page); | 1442 | kpage = stable_tree_search(page); |
1443 | if (kpage == page && rmap_item->head == stable_node) { | ||
1444 | put_page(kpage); | ||
1445 | return; | ||
1446 | } | ||
1447 | |||
1448 | remove_rmap_item_from_tree(rmap_item); | ||
1449 | |||
1195 | if (kpage) { | 1450 | if (kpage) { |
1196 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); | 1451 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); |
1197 | if (!err) { | 1452 | if (!err) { |
@@ -1225,14 +1480,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1225 | kpage = try_to_merge_two_pages(rmap_item, page, | 1480 | kpage = try_to_merge_two_pages(rmap_item, page, |
1226 | tree_rmap_item, tree_page); | 1481 | tree_rmap_item, tree_page); |
1227 | put_page(tree_page); | 1482 | put_page(tree_page); |
1228 | /* | ||
1229 | * As soon as we merge this page, we want to remove the | ||
1230 | * rmap_item of the page we have merged with from the unstable | ||
1231 | * tree, and insert it instead as new node in the stable tree. | ||
1232 | */ | ||
1233 | if (kpage) { | 1483 | if (kpage) { |
1234 | remove_rmap_item_from_tree(tree_rmap_item); | 1484 | /* |
1235 | 1485 | * The pages were successfully merged: insert new | |
1486 | * node in the stable tree and add both rmap_items. | ||
1487 | */ | ||
1236 | lock_page(kpage); | 1488 | lock_page(kpage); |
1237 | stable_node = stable_tree_insert(kpage); | 1489 | stable_node = stable_tree_insert(kpage); |
1238 | if (stable_node) { | 1490 | if (stable_node) { |
@@ -1289,6 +1541,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1289 | struct mm_slot *slot; | 1541 | struct mm_slot *slot; |
1290 | struct vm_area_struct *vma; | 1542 | struct vm_area_struct *vma; |
1291 | struct rmap_item *rmap_item; | 1543 | struct rmap_item *rmap_item; |
1544 | int nid; | ||
1292 | 1545 | ||
1293 | if (list_empty(&ksm_mm_head.mm_list)) | 1546 | if (list_empty(&ksm_mm_head.mm_list)) |
1294 | return NULL; | 1547 | return NULL; |
@@ -1307,7 +1560,29 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1307 | */ | 1560 | */ |
1308 | lru_add_drain_all(); | 1561 | lru_add_drain_all(); |
1309 | 1562 | ||
1310 | root_unstable_tree = RB_ROOT; | 1563 | /* |
1564 | * Whereas stale stable_nodes on the stable_tree itself | ||
1565 | * get pruned in the regular course of stable_tree_search(), | ||
1566 | * those moved out to the migrate_nodes list can accumulate: | ||
1567 | * so prune them once before each full scan. | ||
1568 | */ | ||
1569 | if (!ksm_merge_across_nodes) { | ||
1570 | struct stable_node *stable_node; | ||
1571 | struct list_head *this, *next; | ||
1572 | struct page *page; | ||
1573 | |||
1574 | list_for_each_safe(this, next, &migrate_nodes) { | ||
1575 | stable_node = list_entry(this, | ||
1576 | struct stable_node, list); | ||
1577 | page = get_ksm_page(stable_node, false); | ||
1578 | if (page) | ||
1579 | put_page(page); | ||
1580 | cond_resched(); | ||
1581 | } | ||
1582 | } | ||
1583 | |||
1584 | for (nid = 0; nid < ksm_nr_node_ids; nid++) | ||
1585 | root_unstable_tree[nid] = RB_ROOT; | ||
1311 | 1586 | ||
1312 | spin_lock(&ksm_mmlist_lock); | 1587 | spin_lock(&ksm_mmlist_lock); |
1313 | slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); | 1588 | slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); |
@@ -1392,7 +1667,7 @@ next_mm:
1392 | * or when all VM_MERGEABLE areas have been unmapped (and | 1667 | * or when all VM_MERGEABLE areas have been unmapped (and |
1393 | * mmap_sem then protects against race with MADV_MERGEABLE). | 1668 | * mmap_sem then protects against race with MADV_MERGEABLE). |
1394 | */ | 1669 | */ |
1395 | hlist_del(&slot->link); | 1670 | hash_del(&slot->link); |
1396 | list_del(&slot->mm_list); | 1671 | list_del(&slot->mm_list); |
1397 | spin_unlock(&ksm_mmlist_lock); | 1672 | spin_unlock(&ksm_mmlist_lock); |
1398 | 1673 | ||
@@ -1428,8 +1703,7 @@ static void ksm_do_scan(unsigned int scan_npages)
1428 | rmap_item = scan_get_next_rmap_item(&page); | 1703 | rmap_item = scan_get_next_rmap_item(&page); |
1429 | if (!rmap_item) | 1704 | if (!rmap_item) |
1430 | return; | 1705 | return; |
1431 | if (!PageKsm(page) || !in_stable_tree(rmap_item)) | 1706 | cmp_and_merge_page(page, rmap_item); |
1432 | cmp_and_merge_page(page, rmap_item); | ||
1433 | put_page(page); | 1707 | put_page(page); |
1434 | } | 1708 | } |
1435 | } | 1709 | } |
@@ -1446,6 +1720,7 @@ static int ksm_scan_thread(void *nothing)
1446 | 1720 | ||
1447 | while (!kthread_should_stop()) { | 1721 | while (!kthread_should_stop()) { |
1448 | mutex_lock(&ksm_thread_mutex); | 1722 | mutex_lock(&ksm_thread_mutex); |
1723 | wait_while_offlining(); | ||
1449 | if (ksmd_should_run()) | 1724 | if (ksmd_should_run()) |
1450 | ksm_do_scan(ksm_thread_pages_to_scan); | 1725 | ksm_do_scan(ksm_thread_pages_to_scan); |
1451 | mutex_unlock(&ksm_thread_mutex); | 1726 | mutex_unlock(&ksm_thread_mutex); |
@@ -1525,11 +1800,19 @@ int __ksm_enter(struct mm_struct *mm)
1525 | spin_lock(&ksm_mmlist_lock); | 1800 | spin_lock(&ksm_mmlist_lock); |
1526 | insert_to_mm_slots_hash(mm, mm_slot); | 1801 | insert_to_mm_slots_hash(mm, mm_slot); |
1527 | /* | 1802 | /* |
1528 | * Insert just behind the scanning cursor, to let the area settle | 1803 | * When KSM_RUN_MERGE (or KSM_RUN_STOP), |
1804 | * insert just behind the scanning cursor, to let the area settle | ||
1529 | * down a little; when fork is followed by immediate exec, we don't | 1805 | * down a little; when fork is followed by immediate exec, we don't |
1530 | * want ksmd to waste time setting up and tearing down an rmap_list. | 1806 | * want ksmd to waste time setting up and tearing down an rmap_list. |
1807 | * | ||
1808 | * But when KSM_RUN_UNMERGE, it's important to insert ahead of its | ||
1809 | * scanning cursor, otherwise KSM pages in newly forked mms will be | ||
1810 | * missed: then we might as well insert at the end of the list. | ||
1531 | */ | 1811 | */ |
1532 | list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); | 1812 | if (ksm_run & KSM_RUN_UNMERGE) |
1813 | list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list); | ||
1814 | else | ||
1815 | list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); | ||
1533 | spin_unlock(&ksm_mmlist_lock); | 1816 | spin_unlock(&ksm_mmlist_lock); |
1534 | 1817 | ||
1535 | set_bit(MMF_VM_MERGEABLE, &mm->flags); | 1818 | set_bit(MMF_VM_MERGEABLE, &mm->flags); |
@@ -1559,7 +1842,7 @@ void __ksm_exit(struct mm_struct *mm)
1559 | mm_slot = get_mm_slot(mm); | 1842 | mm_slot = get_mm_slot(mm); |
1560 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { | 1843 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { |
1561 | if (!mm_slot->rmap_list) { | 1844 | if (!mm_slot->rmap_list) { |
1562 | hlist_del(&mm_slot->link); | 1845 | hash_del(&mm_slot->link); |
1563 | list_del(&mm_slot->mm_list); | 1846 | list_del(&mm_slot->mm_list); |
1564 | easy_to_free = 1; | 1847 | easy_to_free = 1; |
1565 | } else { | 1848 | } else { |
@@ -1579,24 +1862,32 @@ void __ksm_exit(struct mm_struct *mm)
1579 | } | 1862 | } |
1580 | } | 1863 | } |
1581 | 1864 | ||
1582 | struct page *ksm_does_need_to_copy(struct page *page, | 1865 | struct page *ksm_might_need_to_copy(struct page *page, |
1583 | struct vm_area_struct *vma, unsigned long address) | 1866 | struct vm_area_struct *vma, unsigned long address) |
1584 | { | 1867 | { |
1868 | struct anon_vma *anon_vma = page_anon_vma(page); | ||
1585 | struct page *new_page; | 1869 | struct page *new_page; |
1586 | 1870 | ||
1871 | if (PageKsm(page)) { | ||
1872 | if (page_stable_node(page) && | ||
1873 | !(ksm_run & KSM_RUN_UNMERGE)) | ||
1874 | return page; /* no need to copy it */ | ||
1875 | } else if (!anon_vma) { | ||
1876 | return page; /* no need to copy it */ | ||
1877 | } else if (anon_vma->root == vma->anon_vma->root && | ||
1878 | page->index == linear_page_index(vma, address)) { | ||
1879 | return page; /* still no need to copy it */ | ||
1880 | } | ||
1881 | if (!PageUptodate(page)) | ||
1882 | return page; /* let do_swap_page report the error */ | ||
1883 | |||
1587 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 1884 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
1588 | if (new_page) { | 1885 | if (new_page) { |
1589 | copy_user_highpage(new_page, page, address, vma); | 1886 | copy_user_highpage(new_page, page, address, vma); |
1590 | 1887 | ||
1591 | SetPageDirty(new_page); | 1888 | SetPageDirty(new_page); |
1592 | __SetPageUptodate(new_page); | 1889 | __SetPageUptodate(new_page); |
1593 | SetPageSwapBacked(new_page); | ||
1594 | __set_page_locked(new_page); | 1890 | __set_page_locked(new_page); |
1595 | |||
1596 | if (!mlocked_vma_newpage(vma, new_page)) | ||
1597 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); | ||
1598 | else | ||
1599 | add_page_to_unevictable_list(new_page); | ||
1600 | } | 1891 | } |
1601 | 1892 | ||
1602 | return new_page; | 1893 | return new_page; |
@@ -1773,64 +2064,115 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage)
1773 | if (stable_node) { | 2064 | if (stable_node) { |
1774 | VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); | 2065 | VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); |
1775 | stable_node->kpfn = page_to_pfn(newpage); | 2066 | stable_node->kpfn = page_to_pfn(newpage); |
2067 | /* | ||
2068 | * newpage->mapping was set in advance; now we need smp_wmb() | ||
2069 | * to make sure that the new stable_node->kpfn is visible | ||
2070 | * to get_ksm_page() before it can see that oldpage->mapping | ||
2071 | * has gone stale (or that PageSwapCache has been cleared). | ||
2072 | */ | ||
2073 | smp_wmb(); | ||
2074 | set_page_stable_node(oldpage, NULL); | ||
1776 | } | 2075 | } |
1777 | } | 2076 | } |
1778 | #endif /* CONFIG_MIGRATION */ | 2077 | #endif /* CONFIG_MIGRATION */ |
1779 | 2078 | ||
1780 | #ifdef CONFIG_MEMORY_HOTREMOVE | 2079 | #ifdef CONFIG_MEMORY_HOTREMOVE |
1781 | static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, | 2080 | static int just_wait(void *word) |
1782 | unsigned long end_pfn) | ||
1783 | { | 2081 | { |
1784 | struct rb_node *node; | 2082 | schedule(); |
2083 | return 0; | ||
2084 | } | ||
1785 | 2085 | ||
1786 | for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { | 2086 | static void wait_while_offlining(void) |
1787 | struct stable_node *stable_node; | 2087 | { |
2088 | while (ksm_run & KSM_RUN_OFFLINE) { | ||
2089 | mutex_unlock(&ksm_thread_mutex); | ||
2090 | wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), | ||
2091 | just_wait, TASK_UNINTERRUPTIBLE); | ||
2092 | mutex_lock(&ksm_thread_mutex); | ||
2093 | } | ||
2094 | } | ||
1788 | 2095 | ||
1789 | stable_node = rb_entry(node, struct stable_node, node); | 2096 | static void ksm_check_stable_tree(unsigned long start_pfn, |
2097 | unsigned long end_pfn) | ||
2098 | { | ||
2099 | struct stable_node *stable_node; | ||
2100 | struct list_head *this, *next; | ||
2101 | struct rb_node *node; | ||
2102 | int nid; | ||
2103 | |||
2104 | for (nid = 0; nid < ksm_nr_node_ids; nid++) { | ||
2105 | node = rb_first(root_stable_tree + nid); | ||
2106 | while (node) { | ||
2107 | stable_node = rb_entry(node, struct stable_node, node); | ||
2108 | if (stable_node->kpfn >= start_pfn && | ||
2109 | stable_node->kpfn < end_pfn) { | ||
2110 | /* | ||
2111 | * Don't get_ksm_page, page has already gone: | ||
2112 | * which is why we keep kpfn instead of page* | ||
2113 | */ | ||
2114 | remove_node_from_stable_tree(stable_node); | ||
2115 | node = rb_first(root_stable_tree + nid); | ||
2116 | } else | ||
2117 | node = rb_next(node); | ||
2118 | cond_resched(); | ||
2119 | } | ||
2120 | } | ||
2121 | list_for_each_safe(this, next, &migrate_nodes) { | ||
2122 | stable_node = list_entry(this, struct stable_node, list); | ||
1790 | if (stable_node->kpfn >= start_pfn && | 2123 | if (stable_node->kpfn >= start_pfn && |
1791 | stable_node->kpfn < end_pfn) | 2124 | stable_node->kpfn < end_pfn) |
1792 | return stable_node; | 2125 | remove_node_from_stable_tree(stable_node); |
2126 | cond_resched(); | ||
1793 | } | 2127 | } |
1794 | return NULL; | ||
1795 | } | 2128 | } |
1796 | 2129 | ||
1797 | static int ksm_memory_callback(struct notifier_block *self, | 2130 | static int ksm_memory_callback(struct notifier_block *self, |
1798 | unsigned long action, void *arg) | 2131 | unsigned long action, void *arg) |
1799 | { | 2132 | { |
1800 | struct memory_notify *mn = arg; | 2133 | struct memory_notify *mn = arg; |
1801 | struct stable_node *stable_node; | ||
1802 | 2134 | ||
1803 | switch (action) { | 2135 | switch (action) { |
1804 | case MEM_GOING_OFFLINE: | 2136 | case MEM_GOING_OFFLINE: |
1805 | /* | 2137 | /* |
1806 | * Keep it very simple for now: just lock out ksmd and | 2138 | * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items() |
1807 | * MADV_UNMERGEABLE while any memory is going offline. | 2139 | * and remove_all_stable_nodes() while memory is going offline: |
1808 | * mutex_lock_nested() is necessary because lockdep was alarmed | 2140 | * it is unsafe for them to touch the stable tree at this time. |
1809 | * that here we take ksm_thread_mutex inside notifier chain | 2141 | * But unmerge_ksm_pages(), rmap lookups and other entry points |
1810 | * mutex, and later take notifier chain mutex inside | 2142 | * which do not need the ksm_thread_mutex are all safe. |
1811 | * ksm_thread_mutex to unlock it. But that's safe because both | ||
1812 | * are inside mem_hotplug_mutex. | ||
1813 | */ | 2143 | */ |
1814 | mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); | 2144 | mutex_lock(&ksm_thread_mutex); |
2145 | ksm_run |= KSM_RUN_OFFLINE; | ||
2146 | mutex_unlock(&ksm_thread_mutex); | ||
1815 | break; | 2147 | break; |
1816 | 2148 | ||
1817 | case MEM_OFFLINE: | 2149 | case MEM_OFFLINE: |
1818 | /* | 2150 | /* |
1819 | * Most of the work is done by page migration; but there might | 2151 | * Most of the work is done by page migration; but there might |
1820 | * be a few stable_nodes left over, still pointing to struct | 2152 | * be a few stable_nodes left over, still pointing to struct |
1821 | * pages which have been offlined: prune those from the tree. | 2153 | * pages which have been offlined: prune those from the tree, |
2154 | * otherwise get_ksm_page() might later try to access a | ||
2155 | * non-existent struct page. | ||
1822 | */ | 2156 | */ |
1823 | while ((stable_node = ksm_check_stable_tree(mn->start_pfn, | 2157 | ksm_check_stable_tree(mn->start_pfn, |
1824 | mn->start_pfn + mn->nr_pages)) != NULL) | 2158 | mn->start_pfn + mn->nr_pages); |
1825 | remove_node_from_stable_tree(stable_node); | ||
1826 | /* fallthrough */ | 2159 | /* fallthrough */ |
1827 | 2160 | ||
1828 | case MEM_CANCEL_OFFLINE: | 2161 | case MEM_CANCEL_OFFLINE: |
2162 | mutex_lock(&ksm_thread_mutex); | ||
2163 | ksm_run &= ~KSM_RUN_OFFLINE; | ||
1829 | mutex_unlock(&ksm_thread_mutex); | 2164 | mutex_unlock(&ksm_thread_mutex); |
2165 | |||
2166 | smp_mb(); /* wake_up_bit advises this */ | ||
2167 | wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE)); | ||
1830 | break; | 2168 | break; |
1831 | } | 2169 | } |
1832 | return NOTIFY_OK; | 2170 | return NOTIFY_OK; |
1833 | } | 2171 | } |
2172 | #else | ||
2173 | static void wait_while_offlining(void) | ||
2174 | { | ||
2175 | } | ||
1834 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 2176 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
1835 | 2177 | ||
1836 | #ifdef CONFIG_SYSFS | 2178 | #ifdef CONFIG_SYSFS |
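Note: memory hot-remove no longer holds ksm_thread_mutex across the whole notifier; it just sets KSM_RUN_OFFLINE, and ksmd plus the sysfs handlers park themselves in wait_while_offlining() until the bit is cleared and wake_up_bit() is called. The sketch below models that "raise a flag, sleep until it drops" shape in userspace with a mutex and condition variable instead of the kernel's wait_on_bit()/wake_up_bit(); the structure, not the primitives, is the point:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int offlining;                         /* plays the role of KSM_RUN_OFFLINE */

static void wait_while_offlining(void)
{
        pthread_mutex_lock(&lock);
        while (offlining)
                pthread_cond_wait(&cond, &lock);   /* drops the lock while sleeping */
        pthread_mutex_unlock(&lock);
}

static void *scanner(void *arg)
{
        (void)arg;
        wait_while_offlining();
        printf("scanner: resuming work\n");
        return NULL;
}

int main(void)
{
        pthread_t t;

        offlining = 1;                        /* MEM_GOING_OFFLINE analogue */
        pthread_create(&t, NULL, scanner, NULL);
        sleep(1);                             /* pretend hot-remove takes a while */
        pthread_mutex_lock(&lock);
        offlining = 0;                        /* MEM_OFFLINE / MEM_CANCEL_OFFLINE analogue */
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        printf("done\n");
        return 0;
}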
@@ -1893,7 +2235,7 @@ KSM_ATTR(pages_to_scan);
1893 | static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, | 2235 | static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, |
1894 | char *buf) | 2236 | char *buf) |
1895 | { | 2237 | { |
1896 | return sprintf(buf, "%u\n", ksm_run); | 2238 | return sprintf(buf, "%lu\n", ksm_run); |
1897 | } | 2239 | } |
1898 | 2240 | ||
1899 | static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | 2241 | static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, |
@@ -1916,6 +2258,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1916 | */ | 2258 | */ |
1917 | 2259 | ||
1918 | mutex_lock(&ksm_thread_mutex); | 2260 | mutex_lock(&ksm_thread_mutex); |
2261 | wait_while_offlining(); | ||
1919 | if (ksm_run != flags) { | 2262 | if (ksm_run != flags) { |
1920 | ksm_run = flags; | 2263 | ksm_run = flags; |
1921 | if (flags & KSM_RUN_UNMERGE) { | 2264 | if (flags & KSM_RUN_UNMERGE) { |
@@ -1937,6 +2280,64 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1937 | } | 2280 | } |
1938 | KSM_ATTR(run); | 2281 | KSM_ATTR(run); |
1939 | 2282 | ||
2283 | #ifdef CONFIG_NUMA | ||
2284 | static ssize_t merge_across_nodes_show(struct kobject *kobj, | ||
2285 | struct kobj_attribute *attr, char *buf) | ||
2286 | { | ||
2287 | return sprintf(buf, "%u\n", ksm_merge_across_nodes); | ||
2288 | } | ||
2289 | |||
2290 | static ssize_t merge_across_nodes_store(struct kobject *kobj, | ||
2291 | struct kobj_attribute *attr, | ||
2292 | const char *buf, size_t count) | ||
2293 | { | ||
2294 | int err; | ||
2295 | unsigned long knob; | ||
2296 | |||
2297 | err = kstrtoul(buf, 10, &knob); | ||
2298 | if (err) | ||
2299 | return err; | ||
2300 | if (knob > 1) | ||
2301 | return -EINVAL; | ||
2302 | |||
2303 | mutex_lock(&ksm_thread_mutex); | ||
2304 | wait_while_offlining(); | ||
2305 | if (ksm_merge_across_nodes != knob) { | ||
2306 | if (ksm_pages_shared || remove_all_stable_nodes()) | ||
2307 | err = -EBUSY; | ||
2308 | else if (root_stable_tree == one_stable_tree) { | ||
2309 | struct rb_root *buf; | ||
2310 | /* | ||
2311 | * This is the first time that we switch away from the | ||
2312 | * default of merging across nodes: must now allocate | ||
2313 | * a buffer to hold as many roots as may be needed. | ||
2314 | * Allocate stable and unstable together: | ||
2315 | * MAXSMP NODES_SHIFT 10 will use 16kB. | ||
2316 | */ | ||
2317 | buf = kcalloc(nr_node_ids + nr_node_ids, | ||
2318 | sizeof(*buf), GFP_KERNEL | __GFP_ZERO); | ||
2319 | /* Let us assume that RB_ROOT is NULL is zero */ | ||
2320 | if (!buf) | ||
2321 | err = -ENOMEM; | ||
2322 | else { | ||
2323 | root_stable_tree = buf; | ||
2324 | root_unstable_tree = buf + nr_node_ids; | ||
2325 | /* Stable tree is empty but not the unstable */ | ||
2326 | root_unstable_tree[0] = one_unstable_tree[0]; | ||
2327 | } | ||
2328 | } | ||
2329 | if (!err) { | ||
2330 | ksm_merge_across_nodes = knob; | ||
2331 | ksm_nr_node_ids = knob ? 1 : nr_node_ids; | ||
2332 | } | ||
2333 | } | ||
2334 | mutex_unlock(&ksm_thread_mutex); | ||
2335 | |||
2336 | return err ? err : count; | ||
2337 | } | ||
2338 | KSM_ATTR(merge_across_nodes); | ||
2339 | #endif | ||
2340 | |||
1940 | static ssize_t pages_shared_show(struct kobject *kobj, | 2341 | static ssize_t pages_shared_show(struct kobject *kobj, |
1941 | struct kobj_attribute *attr, char *buf) | 2342 | struct kobj_attribute *attr, char *buf) |
1942 | { | 2343 | { |
@@ -1991,6 +2392,9 @@ static struct attribute *ksm_attrs[] = {
1991 | &pages_unshared_attr.attr, | 2392 | &pages_unshared_attr.attr, |
1992 | &pages_volatile_attr.attr, | 2393 | &pages_volatile_attr.attr, |
1993 | &full_scans_attr.attr, | 2394 | &full_scans_attr.attr, |
2395 | #ifdef CONFIG_NUMA | ||
2396 | &merge_across_nodes_attr.attr, | ||
2397 | #endif | ||
1994 | NULL, | 2398 | NULL, |
1995 | }; | 2399 | }; |
1996 | 2400 | ||
@@ -2029,10 +2433,7 @@ static int __init ksm_init(void)
2029 | #endif /* CONFIG_SYSFS */ | 2433 | #endif /* CONFIG_SYSFS */ |
2030 | 2434 | ||
2031 | #ifdef CONFIG_MEMORY_HOTREMOVE | 2435 | #ifdef CONFIG_MEMORY_HOTREMOVE |
2032 | /* | 2436 | /* There is no significance to this priority 100 */ |
2033 | * Choose a high priority since the callback takes ksm_thread_mutex: | ||
2034 | * later callbacks could only be taking locks which nest within that. | ||
2035 | */ | ||
2036 | hotplug_memory_notifier(ksm_memory_callback, 100); | 2437 | hotplug_memory_notifier(ksm_memory_callback, 100); |
2037 | #endif | 2438 | #endif |
2038 | return 0; | 2439 | return 0; |