Diffstat (limited to 'mm/ksm.c')
 mm/ksm.c | 657 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 529 insertions(+), 128 deletions(-)
diff --git a/mm/ksm.c b/mm/ksm.c
index 51573858938d..ab2ba9ad3c59 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -33,13 +33,22 @@
33#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h> 36#include <linux/hashtable.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/oom.h> 38#include <linux/oom.h>
39#include <linux/numa.h>
39 40
40#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
41#include "internal.h" 42#include "internal.h"
42 43
44#ifdef CONFIG_NUMA
45#define NUMA(x) (x)
46#define DO_NUMA(x) do { (x); } while (0)
47#else
48#define NUMA(x) (0)
49#define DO_NUMA(x) do { } while (0)
50#endif
51
43/* 52/*
44 * A few notes about the KSM scanning process, 53 * A few notes about the KSM scanning process,
45 * to make it easier to understand the data structures below: 54 * to make it easier to understand the data structures below:
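Note on the NUMA()/DO_NUMA() helpers added above: they let the rest of the patch read or assign per-node fields without an #ifdef at every use site, because on !CONFIG_NUMA the macro argument is never evaluated, so fields that only exist under CONFIG_NUMA may still appear inside the macros. A minimal stand-alone sketch of the pattern (struct demo_node and its callers are illustrative, not from the patch):

#define CONFIG_NUMA_DEMO 1              /* flip to 0 to mimic !CONFIG_NUMA */

#if CONFIG_NUMA_DEMO
#define NUMA(x)     (x)
#define DO_NUMA(x)  do { (x); } while (0)
#else
#define NUMA(x)     (0)
#define DO_NUMA(x)  do { } while (0)    /* x never expands, so it may name a field that does not exist */
#endif

struct demo_node {
#if CONFIG_NUMA_DEMO
        int nid;
#endif
};

static void demo_set_nid(struct demo_node *n, int nid)
{
        DO_NUMA(n->nid = nid);          /* whole statement vanishes on !NUMA */
}

static int demo_tree_index(const struct demo_node *n)
{
        return NUMA(n->nid);            /* constant 0 on !NUMA builds */
}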
@@ -78,6 +87,9 @@
78 * take 10 attempts to find a page in the unstable tree, once it is found, 87 * take 10 attempts to find a page in the unstable tree, once it is found,
79 * it is secured in the stable tree. (When we scan a new page, we first 88 * it is secured in the stable tree. (When we scan a new page, we first
80 * compare it against the stable tree, and then against the unstable tree.) 89 * compare it against the stable tree, and then against the unstable tree.)
90 *
91 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
92 * stable trees and multiple unstable trees: one of each for each NUMA node.
81 */ 93 */
82 94
83/** 95/**
@@ -113,19 +125,32 @@ struct ksm_scan {
113/** 125/**
114 * struct stable_node - node of the stable rbtree 126 * struct stable_node - node of the stable rbtree
115 * @node: rb node of this ksm page in the stable tree 127 * @node: rb node of this ksm page in the stable tree
128 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
129 * @list: linked into migrate_nodes, pending placement in the proper node tree
116 * @hlist: hlist head of rmap_items using this ksm page 130 * @hlist: hlist head of rmap_items using this ksm page
117 * @kpfn: page frame number of this ksm page 131 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
132 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
118 */ 133 */
119struct stable_node { 134struct stable_node {
120 struct rb_node node; 135 union {
136 struct rb_node node; /* when node of stable tree */
137 struct { /* when listed for migration */
138 struct list_head *head;
139 struct list_head list;
140 };
141 };
121 struct hlist_head hlist; 142 struct hlist_head hlist;
122 unsigned long kpfn; 143 unsigned long kpfn;
144#ifdef CONFIG_NUMA
145 int nid;
146#endif
123}; 147};
124 148
125/** 149/**
126 * struct rmap_item - reverse mapping item for virtual addresses 150 * struct rmap_item - reverse mapping item for virtual addresses
127 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list 151 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
128 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree 152 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
153 * @nid: NUMA node id of unstable tree in which linked (may not match page)
129 * @mm: the memory structure this rmap_item is pointing into 154 * @mm: the memory structure this rmap_item is pointing into
130 * @address: the virtual address this rmap_item tracks (+ flags in low bits) 155 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
131 * @oldchecksum: previous checksum of the page at that virtual address 156 * @oldchecksum: previous checksum of the page at that virtual address
@@ -135,7 +160,12 @@ struct stable_node {
135 */ 160 */
136struct rmap_item { 161struct rmap_item {
137 struct rmap_item *rmap_list; 162 struct rmap_item *rmap_list;
138 struct anon_vma *anon_vma; /* when stable */ 163 union {
164 struct anon_vma *anon_vma; /* when stable */
165#ifdef CONFIG_NUMA
166 int nid; /* when node of unstable tree */
167#endif
168 };
139 struct mm_struct *mm; 169 struct mm_struct *mm;
140 unsigned long address; /* + low bits used for flags below */ 170 unsigned long address; /* + low bits used for flags below */
141 unsigned int oldchecksum; /* when unstable */ 171 unsigned int oldchecksum; /* when unstable */
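The two unions introduced above work because each member is only meaningful in one phase of an object's life: a stable_node is either linked in a stable rb-tree (node) or parked on the migrate_nodes list (head, list), and an rmap_item needs its unstable-tree nid only until it becomes stable, at which point anon_vma takes over. A small userspace sketch of the space saving, with made-up struct names:

#include <stdio.h>

struct rmap_demo_separate {
        void *anon_vma;                 /* used only when stable */
        int nid;                        /* used only when unstable */
        unsigned long address;
};

struct rmap_demo_union {
        union {
                void *anon_vma;         /* when stable */
                int nid;                /* when node of unstable tree */
        };
        unsigned long address;
};

int main(void)
{
        printf("separate fields: %zu bytes\n", sizeof(struct rmap_demo_separate));
        printf("overlaid union:  %zu bytes\n", sizeof(struct rmap_demo_union));
        return 0;
}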
@@ -153,12 +183,16 @@ struct rmap_item {
153#define STABLE_FLAG 0x200 /* is listed from the stable tree */ 183#define STABLE_FLAG 0x200 /* is listed from the stable tree */
154 184
155/* The stable and unstable tree heads */ 185/* The stable and unstable tree heads */
156static struct rb_root root_stable_tree = RB_ROOT; 186static struct rb_root one_stable_tree[1] = { RB_ROOT };
157static struct rb_root root_unstable_tree = RB_ROOT; 187static struct rb_root one_unstable_tree[1] = { RB_ROOT };
188static struct rb_root *root_stable_tree = one_stable_tree;
189static struct rb_root *root_unstable_tree = one_unstable_tree;
158 190
159#define MM_SLOTS_HASH_SHIFT 10 191/* Recently migrated nodes of stable tree, pending proper placement */
160#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT) 192static LIST_HEAD(migrate_nodes);
161static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS]; 193
194#define MM_SLOTS_HASH_BITS 10
195static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
162 196
163static struct mm_slot ksm_mm_head = { 197static struct mm_slot ksm_mm_head = {
164 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), 198 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -189,10 +223,21 @@ static unsigned int ksm_thread_pages_to_scan = 100;
189/* Milliseconds ksmd should sleep between batches */ 223/* Milliseconds ksmd should sleep between batches */
190static unsigned int ksm_thread_sleep_millisecs = 20; 224static unsigned int ksm_thread_sleep_millisecs = 20;
191 225
226#ifdef CONFIG_NUMA
227/* Zeroed when merging across nodes is not allowed */
228static unsigned int ksm_merge_across_nodes = 1;
229static int ksm_nr_node_ids = 1;
230#else
231#define ksm_merge_across_nodes 1U
232#define ksm_nr_node_ids 1
233#endif
234
192#define KSM_RUN_STOP 0 235#define KSM_RUN_STOP 0
193#define KSM_RUN_MERGE 1 236#define KSM_RUN_MERGE 1
194#define KSM_RUN_UNMERGE 2 237#define KSM_RUN_UNMERGE 2
195static unsigned int ksm_run = KSM_RUN_STOP; 238#define KSM_RUN_OFFLINE 4
239static unsigned long ksm_run = KSM_RUN_STOP;
240static void wait_while_offlining(void);
196 241
197static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); 242static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
198static DEFINE_MUTEX(ksm_thread_mutex); 243static DEFINE_MUTEX(ksm_thread_mutex);
@@ -275,31 +320,21 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
275 320
276static struct mm_slot *get_mm_slot(struct mm_struct *mm) 321static struct mm_slot *get_mm_slot(struct mm_struct *mm)
277{ 322{
278 struct mm_slot *mm_slot;
279 struct hlist_head *bucket;
280 struct hlist_node *node; 323 struct hlist_node *node;
324 struct mm_slot *slot;
325
326 hash_for_each_possible(mm_slots_hash, slot, node, link, (unsigned long)mm)
327 if (slot->mm == mm)
328 return slot;
281 329
282 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
283 hlist_for_each_entry(mm_slot, node, bucket, link) {
284 if (mm == mm_slot->mm)
285 return mm_slot;
286 }
287 return NULL; 330 return NULL;
288} 331}
289 332
290static void insert_to_mm_slots_hash(struct mm_struct *mm, 333static void insert_to_mm_slots_hash(struct mm_struct *mm,
291 struct mm_slot *mm_slot) 334 struct mm_slot *mm_slot)
292{ 335{
293 struct hlist_head *bucket;
294
295 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
296 mm_slot->mm = mm; 336 mm_slot->mm = mm;
297 hlist_add_head(&mm_slot->link, bucket); 337 hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
298}
299
300static inline int in_stable_tree(struct rmap_item *rmap_item)
301{
302 return rmap_item->address & STABLE_FLAG;
303} 338}
304 339
305/* 340/*
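The hunk above swaps KSM's open-coded hash_ptr() bucket array for the generic <linux/hashtable.h> helpers; the lookup logic itself is unchanged. For readers unfamiliar with what DEFINE_HASHTABLE()/hash_add()/hash_for_each_possible() boil down to, here is a stand-alone userspace sketch of the same idea (hash_mm(), struct mm_slot_demo and the hash constant are illustrative stand-ins, not kernel code):

#include <stdint.h>
#include <stddef.h>

#define SLOTS_HASH_BITS 10
#define SLOTS_HASH_SIZE (1U << SLOTS_HASH_BITS)

struct mm_slot_demo {
        struct mm_slot_demo *next;      /* bucket chain, like the hlist link */
        void *mm;                       /* the key */
};

static struct mm_slot_demo *slots_hash[SLOTS_HASH_SIZE];

static unsigned int hash_mm(void *mm)
{
        /* multiplicative hash of the pointer, folded down to SLOTS_HASH_BITS bits */
        return (unsigned int)(((uintptr_t)mm * 0x61C8864680B583EBull) >>
                              (64 - SLOTS_HASH_BITS));
}

static void insert_slot(struct mm_slot_demo *slot, void *mm)
{
        unsigned int b = hash_mm(mm);

        slot->mm = mm;
        slot->next = slots_hash[b];
        slots_hash[b] = slot;
}

static struct mm_slot_demo *get_slot(void *mm)
{
        struct mm_slot_demo *slot;

        for (slot = slots_hash[hash_mm(mm)]; slot; slot = slot->next)
                if (slot->mm == mm)
                        return slot;
        return NULL;
}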
@@ -333,7 +368,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
333 368
334 do { 369 do {
335 cond_resched(); 370 cond_resched();
336 page = follow_page(vma, addr, FOLL_GET); 371 page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
337 if (IS_ERR_OR_NULL(page)) 372 if (IS_ERR_OR_NULL(page))
338 break; 373 break;
339 if (PageKsm(page)) 374 if (PageKsm(page))
@@ -447,6 +482,17 @@ out: page = NULL;
447 return page; 482 return page;
448} 483}
449 484
485/*
486 * This helper is used for getting right index into array of tree roots.
487 * When merge_across_nodes knob is set to 1, there are only two rb-trees for
488 * stable and unstable pages from all nodes with roots in index 0. Otherwise,
489 * every node has its own stable and unstable tree.
490 */
491static inline int get_kpfn_nid(unsigned long kpfn)
492{
493 return ksm_merge_across_nodes ? 0 : pfn_to_nid(kpfn);
494}
495
450static void remove_node_from_stable_tree(struct stable_node *stable_node) 496static void remove_node_from_stable_tree(struct stable_node *stable_node)
451{ 497{
452 struct rmap_item *rmap_item; 498 struct rmap_item *rmap_item;
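get_kpfn_nid(), added above, is what lets one code path serve both layouts: with merge_across_nodes set, every page indexes root 0; with it clear, the page's own node picks the tree. A toy version with a plain int standing in for pfn_to_nid():

#include <stdio.h>

static int merge_across_nodes = 1;      /* mirrors the sysfs knob's default */

/* page_nid stands in for pfn_to_nid(kpfn) */
static int tree_index(int page_nid)
{
        return merge_across_nodes ? 0 : page_nid;
}

int main(void)
{
        printf("across nodes: page on node 3 -> tree %d\n", tree_index(3));
        merge_across_nodes = 0;
        printf("per-node:     page on node 3 -> tree %d\n", tree_index(3));
        return 0;
}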
@@ -462,7 +508,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
462 cond_resched(); 508 cond_resched();
463 } 509 }
464 510
465 rb_erase(&stable_node->node, &root_stable_tree); 511 if (stable_node->head == &migrate_nodes)
512 list_del(&stable_node->list);
513 else
514 rb_erase(&stable_node->node,
515 root_stable_tree + NUMA(stable_node->nid));
466 free_stable_node(stable_node); 516 free_stable_node(stable_node);
467} 517}
468 518
@@ -472,6 +522,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
472 * In which case we can trust the content of the page, and it 522 * In which case we can trust the content of the page, and it
473 * returns the gotten page; but if the page has now been zapped, 523 * returns the gotten page; but if the page has now been zapped,
474 * remove the stale node from the stable tree and return NULL. 524 * remove the stale node from the stable tree and return NULL.
525 * But beware, the stable node's page might be being migrated.
475 * 526 *
476 * You would expect the stable_node to hold a reference to the ksm page. 527 * You would expect the stable_node to hold a reference to the ksm page.
477 * But if it increments the page's count, swapping out has to wait for 528 * But if it increments the page's count, swapping out has to wait for
@@ -482,40 +533,77 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
482 * pointing back to this stable node. This relies on freeing a PageAnon 533 * pointing back to this stable node. This relies on freeing a PageAnon
483 * page to reset its page->mapping to NULL, and relies on no other use of 534 * page to reset its page->mapping to NULL, and relies on no other use of
484 * a page to put something that might look like our key in page->mapping. 535 * a page to put something that might look like our key in page->mapping.
485 *
486 * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
487 * but this is different - made simpler by ksm_thread_mutex being held, but
488 * interesting for assuming that no other use of the struct page could ever
489 * put our expected_mapping into page->mapping (or a field of the union which
490 * coincides with page->mapping). The RCU calls are not for KSM at all, but
491 * to keep the page_count protocol described with page_cache_get_speculative.
492 *
493 * Note: it is possible that get_ksm_page() will return NULL one moment,
494 * then page the next, if the page is in between page_freeze_refs() and
495 * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
496 * is on its way to being freed; but it is an anomaly to bear in mind. 536 * is on its way to being freed; but it is an anomaly to bear in mind.
497 */ 537 */
498static struct page *get_ksm_page(struct stable_node *stable_node) 538static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
499{ 539{
500 struct page *page; 540 struct page *page;
501 void *expected_mapping; 541 void *expected_mapping;
542 unsigned long kpfn;
502 543
503 page = pfn_to_page(stable_node->kpfn);
504 expected_mapping = (void *)stable_node + 544 expected_mapping = (void *)stable_node +
505 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 545 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
506 rcu_read_lock(); 546again:
507 if (page->mapping != expected_mapping) 547 kpfn = ACCESS_ONCE(stable_node->kpfn);
508 goto stale; 548 page = pfn_to_page(kpfn);
509 if (!get_page_unless_zero(page)) 549
550 /*
551 * page is computed from kpfn, so on most architectures reading
552 * page->mapping is naturally ordered after reading node->kpfn,
553 * but on Alpha we need to be more careful.
554 */
555 smp_read_barrier_depends();
556 if (ACCESS_ONCE(page->mapping) != expected_mapping)
510 goto stale; 557 goto stale;
511 if (page->mapping != expected_mapping) { 558
559 /*
560 * We cannot do anything with the page while its refcount is 0.
561 * Usually 0 means free, or tail of a higher-order page: in which
562 * case this node is no longer referenced, and should be freed;
563 * however, it might mean that the page is under page_freeze_refs().
564 * The __remove_mapping() case is easy, again the node is now stale;
565 * but if page is swapcache in migrate_page_move_mapping(), it might
566 * still be our page, in which case it's essential to keep the node.
567 */
568 while (!get_page_unless_zero(page)) {
569 /*
570 * Another check for page->mapping != expected_mapping would
571 * work here too. We have chosen the !PageSwapCache test to
572 * optimize the common case, when the page is or is about to
573 * be freed: PageSwapCache is cleared (under spin_lock_irq)
574 * in the freeze_refs section of __remove_mapping(); but Anon
575 * page->mapping reset to NULL later, in free_pages_prepare().
576 */
577 if (!PageSwapCache(page))
578 goto stale;
579 cpu_relax();
580 }
581
582 if (ACCESS_ONCE(page->mapping) != expected_mapping) {
512 put_page(page); 583 put_page(page);
513 goto stale; 584 goto stale;
514 } 585 }
515 rcu_read_unlock(); 586
587 if (lock_it) {
588 lock_page(page);
589 if (ACCESS_ONCE(page->mapping) != expected_mapping) {
590 unlock_page(page);
591 put_page(page);
592 goto stale;
593 }
594 }
516 return page; 595 return page;
596
517stale: 597stale:
518 rcu_read_unlock(); 598 /*
599 * We come here from above when page->mapping or !PageSwapCache
600 * suggests that the node is stale; but it might be under migration.
601 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
602 * before checking whether node->kpfn has been changed.
603 */
604 smp_rmb();
605 if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
606 goto again;
519 remove_node_from_stable_tree(stable_node); 607 remove_node_from_stable_tree(stable_node);
520 return NULL; 608 return NULL;
521} 609}
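The reworked get_ksm_page() above is essentially an optimistic read: snapshot kpfn, validate the page it names, and only declare the node stale if kpfn has not moved in the meantime; the smp_rmb() here pairs with the smp_wmb() added to ksm_migrate_page() further down. A self-contained C11 sketch of that publish/validate/retry ordering (the struct names, the pages[] array and expected are illustrative, and the refcount and PageSwapCache subtleties are left out):

#include <stdatomic.h>
#include <stddef.h>

struct demo_page { _Atomic(void *) mapping; };
struct demo_node { _Atomic unsigned long kpfn; };

static struct demo_page pages[4];       /* a "pfn" indexes this toy array */

/* Migration side: publish the new kpfn before retiring the old mapping,
 * like the smp_wmb() in ksm_migrate_page(). */
static void demo_migrate(struct demo_node *node, unsigned long old_pfn,
                         unsigned long new_pfn)
{
        atomic_store_explicit(&node->kpfn, new_pfn, memory_order_release);
        atomic_store_explicit(&pages[old_pfn].mapping, NULL,
                              memory_order_relaxed);
}

/* Lookup side: the shape of get_ksm_page()'s "goto again" dance. */
static struct demo_page *demo_get_page(struct demo_node *node, void *expected)
{
        unsigned long kpfn;
        struct demo_page *page;
again:
        kpfn = atomic_load_explicit(&node->kpfn, memory_order_acquire);
        page = &pages[kpfn];
        if (atomic_load_explicit(&page->mapping, memory_order_acquire) == expected)
                return page;
        /* Looks stale; but if kpfn changed we merely raced with migration. */
        if (atomic_load_explicit(&node->kpfn, memory_order_acquire) != kpfn)
                goto again;
        return NULL;                    /* genuinely stale */
}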
@@ -531,11 +619,10 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
531 struct page *page; 619 struct page *page;
532 620
533 stable_node = rmap_item->head; 621 stable_node = rmap_item->head;
534 page = get_ksm_page(stable_node); 622 page = get_ksm_page(stable_node, true);
535 if (!page) 623 if (!page)
536 goto out; 624 goto out;
537 625
538 lock_page(page);
539 hlist_del(&rmap_item->hlist); 626 hlist_del(&rmap_item->hlist);
540 unlock_page(page); 627 unlock_page(page);
541 put_page(page); 628 put_page(page);
@@ -560,8 +647,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
560 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); 647 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
561 BUG_ON(age > 1); 648 BUG_ON(age > 1);
562 if (!age) 649 if (!age)
563 rb_erase(&rmap_item->node, &root_unstable_tree); 650 rb_erase(&rmap_item->node,
564 651 root_unstable_tree + NUMA(rmap_item->nid));
565 ksm_pages_unshared--; 652 ksm_pages_unshared--;
566 rmap_item->address &= PAGE_MASK; 653 rmap_item->address &= PAGE_MASK;
567 } 654 }
@@ -581,7 +668,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
581} 668}
582 669
583/* 670/*
584 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather 671 * Though it's very tempting to unmerge rmap_items from stable tree rather
585 * than check every pte of a given vma, the locking doesn't quite work for 672 * than check every pte of a given vma, the locking doesn't quite work for
586 * that - an rmap_item is assigned to the stable tree after inserting ksm 673 * that - an rmap_item is assigned to the stable tree after inserting ksm
587 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing 674 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
@@ -614,6 +701,71 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
614/* 701/*
615 * Only called through the sysfs control interface: 702 * Only called through the sysfs control interface:
616 */ 703 */
704static int remove_stable_node(struct stable_node *stable_node)
705{
706 struct page *page;
707 int err;
708
709 page = get_ksm_page(stable_node, true);
710 if (!page) {
711 /*
712 * get_ksm_page did remove_node_from_stable_tree itself.
713 */
714 return 0;
715 }
716
717 if (WARN_ON_ONCE(page_mapped(page))) {
718 /*
719 * This should not happen: but if it does, just refuse to let
720 * merge_across_nodes be switched - there is no need to panic.
721 */
722 err = -EBUSY;
723 } else {
724 /*
725 * The stable node did not yet appear stale to get_ksm_page(),
726 * since that allows for an unmapped ksm page to be recognized
727 * right up until it is freed; but the node is safe to remove.
728 * This page might be in a pagevec waiting to be freed,
729 * or it might be PageSwapCache (perhaps under writeback),
730 * or it might have been removed from swapcache a moment ago.
731 */
732 set_page_stable_node(page, NULL);
733 remove_node_from_stable_tree(stable_node);
734 err = 0;
735 }
736
737 unlock_page(page);
738 put_page(page);
739 return err;
740}
741
742static int remove_all_stable_nodes(void)
743{
744 struct stable_node *stable_node;
745 struct list_head *this, *next;
746 int nid;
747 int err = 0;
748
749 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
750 while (root_stable_tree[nid].rb_node) {
751 stable_node = rb_entry(root_stable_tree[nid].rb_node,
752 struct stable_node, node);
753 if (remove_stable_node(stable_node)) {
754 err = -EBUSY;
755 break; /* proceed to next nid */
756 }
757 cond_resched();
758 }
759 }
760 list_for_each_safe(this, next, &migrate_nodes) {
761 stable_node = list_entry(this, struct stable_node, list);
762 if (remove_stable_node(stable_node))
763 err = -EBUSY;
764 cond_resched();
765 }
766 return err;
767}
768
617static int unmerge_and_remove_all_rmap_items(void) 769static int unmerge_and_remove_all_rmap_items(void)
618{ 770{
619 struct mm_slot *mm_slot; 771 struct mm_slot *mm_slot;
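remove_all_stable_nodes() above, like the hot-remove and per-scan pruning later in the patch, walks migrate_nodes with list_for_each_safe() because the entry being visited may be removed and freed mid-walk. A userspace sketch of that safe-while-deleting shape, on a plain singly linked list rather than the kernel's list_head (names are illustrative):

#include <stdlib.h>

struct pending {
        struct pending *next;
        unsigned long kpfn;
};

/* Remove every entry whose kpfn falls in [start, end); the successor is
 * saved before the current entry can be freed, which is exactly why
 * list_for_each_safe() exists. */
static void prune_pending(struct pending **head,
                          unsigned long start, unsigned long end)
{
        struct pending **link = head;
        struct pending *this, *next;

        for (this = *head; this; this = next) {
                next = this->next;
                if (this->kpfn >= start && this->kpfn < end) {
                        *link = next;
                        free(this);
                } else {
                        link = &this->next;
                }
        }
}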
@@ -647,7 +799,7 @@ static int unmerge_and_remove_all_rmap_items(void)
647 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, 799 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
648 struct mm_slot, mm_list); 800 struct mm_slot, mm_list);
649 if (ksm_test_exit(mm)) { 801 if (ksm_test_exit(mm)) {
650 hlist_del(&mm_slot->link); 802 hash_del(&mm_slot->link);
651 list_del(&mm_slot->mm_list); 803 list_del(&mm_slot->mm_list);
652 spin_unlock(&ksm_mmlist_lock); 804 spin_unlock(&ksm_mmlist_lock);
653 805
@@ -661,6 +813,8 @@ static int unmerge_and_remove_all_rmap_items(void)
661 } 813 }
662 } 814 }
663 815
816 /* Clean up stable nodes, but don't worry if some are still busy */
817 remove_all_stable_nodes();
664 ksm_scan.seqnr = 0; 818 ksm_scan.seqnr = 0;
665 return 0; 819 return 0;
666 820
@@ -946,6 +1100,9 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
946 if (err) 1100 if (err)
947 goto out; 1101 goto out;
948 1102
1103 /* Unstable nid is in union with stable anon_vma: remove first */
1104 remove_rmap_item_from_tree(rmap_item);
1105
949 /* Must get reference to anon_vma while still holding mmap_sem */ 1106 /* Must get reference to anon_vma while still holding mmap_sem */
950 rmap_item->anon_vma = vma->anon_vma; 1107 rmap_item->anon_vma = vma->anon_vma;
951 get_anon_vma(vma->anon_vma); 1108 get_anon_vma(vma->anon_vma);
@@ -996,42 +1153,99 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
996 */ 1153 */
997static struct page *stable_tree_search(struct page *page) 1154static struct page *stable_tree_search(struct page *page)
998{ 1155{
999 struct rb_node *node = root_stable_tree.rb_node; 1156 int nid;
1157 struct rb_root *root;
1158 struct rb_node **new;
1159 struct rb_node *parent;
1000 struct stable_node *stable_node; 1160 struct stable_node *stable_node;
1161 struct stable_node *page_node;
1001 1162
1002 stable_node = page_stable_node(page); 1163 page_node = page_stable_node(page);
1003 if (stable_node) { /* ksm page forked */ 1164 if (page_node && page_node->head != &migrate_nodes) {
1165 /* ksm page forked */
1004 get_page(page); 1166 get_page(page);
1005 return page; 1167 return page;
1006 } 1168 }
1007 1169
1008 while (node) { 1170 nid = get_kpfn_nid(page_to_pfn(page));
1171 root = root_stable_tree + nid;
1172again:
1173 new = &root->rb_node;
1174 parent = NULL;
1175
1176 while (*new) {
1009 struct page *tree_page; 1177 struct page *tree_page;
1010 int ret; 1178 int ret;
1011 1179
1012 cond_resched(); 1180 cond_resched();
1013 stable_node = rb_entry(node, struct stable_node, node); 1181 stable_node = rb_entry(*new, struct stable_node, node);
1014 tree_page = get_ksm_page(stable_node); 1182 tree_page = get_ksm_page(stable_node, false);
1015 if (!tree_page) 1183 if (!tree_page)
1016 return NULL; 1184 return NULL;
1017 1185
1018 ret = memcmp_pages(page, tree_page); 1186 ret = memcmp_pages(page, tree_page);
1187 put_page(tree_page);
1019 1188
1020 if (ret < 0) { 1189 parent = *new;
1021 put_page(tree_page); 1190 if (ret < 0)
1022 node = node->rb_left; 1191 new = &parent->rb_left;
1023 } else if (ret > 0) { 1192 else if (ret > 0)
1024 put_page(tree_page); 1193 new = &parent->rb_right;
1025 node = node->rb_right; 1194 else {
1026 } else 1195 /*
1027 return tree_page; 1196 * Lock and unlock the stable_node's page (which
1197 * might already have been migrated) so that page
1198 * migration is sure to notice its raised count.
1199 * It would be more elegant to return stable_node
1200 * than kpage, but that involves more changes.
1201 */
1202 tree_page = get_ksm_page(stable_node, true);
1203 if (tree_page) {
1204 unlock_page(tree_page);
1205 if (get_kpfn_nid(stable_node->kpfn) !=
1206 NUMA(stable_node->nid)) {
1207 put_page(tree_page);
1208 goto replace;
1209 }
1210 return tree_page;
1211 }
1212 /*
1213 * There is now a place for page_node, but the tree may
1214 * have been rebalanced, so re-evaluate parent and new.
1215 */
1216 if (page_node)
1217 goto again;
1218 return NULL;
1219 }
1028 } 1220 }
1029 1221
1030 return NULL; 1222 if (!page_node)
1223 return NULL;
1224
1225 list_del(&page_node->list);
1226 DO_NUMA(page_node->nid = nid);
1227 rb_link_node(&page_node->node, parent, new);
1228 rb_insert_color(&page_node->node, root);
1229 get_page(page);
1230 return page;
1231
1232replace:
1233 if (page_node) {
1234 list_del(&page_node->list);
1235 DO_NUMA(page_node->nid = nid);
1236 rb_replace_node(&stable_node->node, &page_node->node, root);
1237 get_page(page);
1238 } else {
1239 rb_erase(&stable_node->node, root);
1240 page = NULL;
1241 }
1242 stable_node->head = &migrate_nodes;
1243 list_add(&stable_node->list, stable_node->head);
1244 return page;
1031} 1245}
1032 1246
1033/* 1247/*
1034 * stable_tree_insert - insert rmap_item pointing to new ksm page 1248 * stable_tree_insert - insert stable tree node pointing to new ksm page
1035 * into the stable tree. 1249 * into the stable tree.
1036 * 1250 *
1037 * This function returns the stable tree node just allocated on success, 1251 * This function returns the stable tree node just allocated on success,
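The rewritten stable_tree_search() above no longer just follows rb_node pointers downward: it keeps the parent and the child-link slot (**new) as it descends, so a miss can immediately link a forked or migrated page_node into the spot where the search ended, and a stale hit can be rb_replace_node()'d in place. The same walk shape on an ordinary binary search tree, stripped of the rbtree colouring and the KSM specifics:

#include <stdlib.h>

struct bst {
        struct bst *left, *right;
        int key;
};

/* Search for key; on a miss, graft a new node exactly where the descent
 * stopped. No second walk is needed because 'new' still points at the empty
 * child slot; rb_link_node()/rb_insert_color() play this role in ksm.c. */
static struct bst *search_or_insert(struct bst **root, int key)
{
        struct bst **new = root;
        struct bst *node;

        while (*new) {
                if (key < (*new)->key)
                        new = &(*new)->left;
                else if (key > (*new)->key)
                        new = &(*new)->right;
                else
                        return *new;            /* found */
        }
        node = calloc(1, sizeof(*node));
        if (node) {
                node->key = key;
                *new = node;                    /* insert at the miss point */
        }
        return node;
}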
@@ -1039,17 +1253,25 @@ static struct page *stable_tree_search(struct page *page)
1039 */ 1253 */
1040static struct stable_node *stable_tree_insert(struct page *kpage) 1254static struct stable_node *stable_tree_insert(struct page *kpage)
1041{ 1255{
1042 struct rb_node **new = &root_stable_tree.rb_node; 1256 int nid;
1257 unsigned long kpfn;
1258 struct rb_root *root;
1259 struct rb_node **new;
1043 struct rb_node *parent = NULL; 1260 struct rb_node *parent = NULL;
1044 struct stable_node *stable_node; 1261 struct stable_node *stable_node;
1045 1262
1263 kpfn = page_to_pfn(kpage);
1264 nid = get_kpfn_nid(kpfn);
1265 root = root_stable_tree + nid;
1266 new = &root->rb_node;
1267
1046 while (*new) { 1268 while (*new) {
1047 struct page *tree_page; 1269 struct page *tree_page;
1048 int ret; 1270 int ret;
1049 1271
1050 cond_resched(); 1272 cond_resched();
1051 stable_node = rb_entry(*new, struct stable_node, node); 1273 stable_node = rb_entry(*new, struct stable_node, node);
1052 tree_page = get_ksm_page(stable_node); 1274 tree_page = get_ksm_page(stable_node, false);
1053 if (!tree_page) 1275 if (!tree_page)
1054 return NULL; 1276 return NULL;
1055 1277
@@ -1075,13 +1297,12 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
1075 if (!stable_node) 1297 if (!stable_node)
1076 return NULL; 1298 return NULL;
1077 1299
1078 rb_link_node(&stable_node->node, parent, new);
1079 rb_insert_color(&stable_node->node, &root_stable_tree);
1080
1081 INIT_HLIST_HEAD(&stable_node->hlist); 1300 INIT_HLIST_HEAD(&stable_node->hlist);
1082 1301 stable_node->kpfn = kpfn;
1083 stable_node->kpfn = page_to_pfn(kpage);
1084 set_page_stable_node(kpage, stable_node); 1302 set_page_stable_node(kpage, stable_node);
1303 DO_NUMA(stable_node->nid = nid);
1304 rb_link_node(&stable_node->node, parent, new);
1305 rb_insert_color(&stable_node->node, root);
1085 1306
1086 return stable_node; 1307 return stable_node;
1087} 1308}
@@ -1104,10 +1325,15 @@ static
1104struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, 1325struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1105 struct page *page, 1326 struct page *page,
1106 struct page **tree_pagep) 1327 struct page **tree_pagep)
1107
1108{ 1328{
1109 struct rb_node **new = &root_unstable_tree.rb_node; 1329 struct rb_node **new;
1330 struct rb_root *root;
1110 struct rb_node *parent = NULL; 1331 struct rb_node *parent = NULL;
1332 int nid;
1333
1334 nid = get_kpfn_nid(page_to_pfn(page));
1335 root = root_unstable_tree + nid;
1336 new = &root->rb_node;
1111 1337
1112 while (*new) { 1338 while (*new) {
1113 struct rmap_item *tree_rmap_item; 1339 struct rmap_item *tree_rmap_item;
@@ -1137,6 +1363,15 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1137 } else if (ret > 0) { 1363 } else if (ret > 0) {
1138 put_page(tree_page); 1364 put_page(tree_page);
1139 new = &parent->rb_right; 1365 new = &parent->rb_right;
1366 } else if (!ksm_merge_across_nodes &&
1367 page_to_nid(tree_page) != nid) {
1368 /*
1369 * If tree_page has been migrated to another NUMA node,
1370 * it will be flushed out and put in the right unstable
1371 * tree next time: only merge with it when across_nodes.
1372 */
1373 put_page(tree_page);
1374 return NULL;
1140 } else { 1375 } else {
1141 *tree_pagep = tree_page; 1376 *tree_pagep = tree_page;
1142 return tree_rmap_item; 1377 return tree_rmap_item;
@@ -1145,8 +1380,9 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1145 1380
1146 rmap_item->address |= UNSTABLE_FLAG; 1381 rmap_item->address |= UNSTABLE_FLAG;
1147 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); 1382 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1383 DO_NUMA(rmap_item->nid = nid);
1148 rb_link_node(&rmap_item->node, parent, new); 1384 rb_link_node(&rmap_item->node, parent, new);
1149 rb_insert_color(&rmap_item->node, &root_unstable_tree); 1385 rb_insert_color(&rmap_item->node, root);
1150 1386
1151 ksm_pages_unshared++; 1387 ksm_pages_unshared++;
1152 return NULL; 1388 return NULL;
@@ -1188,10 +1424,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1188 unsigned int checksum; 1424 unsigned int checksum;
1189 int err; 1425 int err;
1190 1426
1191 remove_rmap_item_from_tree(rmap_item); 1427 stable_node = page_stable_node(page);
1428 if (stable_node) {
1429 if (stable_node->head != &migrate_nodes &&
1430 get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
1431 rb_erase(&stable_node->node,
1432 root_stable_tree + NUMA(stable_node->nid));
1433 stable_node->head = &migrate_nodes;
1434 list_add(&stable_node->list, stable_node->head);
1435 }
1436 if (stable_node->head != &migrate_nodes &&
1437 rmap_item->head == stable_node)
1438 return;
1439 }
1192 1440
1193 /* We first start with searching the page inside the stable tree */ 1441 /* We first start with searching the page inside the stable tree */
1194 kpage = stable_tree_search(page); 1442 kpage = stable_tree_search(page);
1443 if (kpage == page && rmap_item->head == stable_node) {
1444 put_page(kpage);
1445 return;
1446 }
1447
1448 remove_rmap_item_from_tree(rmap_item);
1449
1195 if (kpage) { 1450 if (kpage) {
1196 err = try_to_merge_with_ksm_page(rmap_item, page, kpage); 1451 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
1197 if (!err) { 1452 if (!err) {
@@ -1225,14 +1480,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1225 kpage = try_to_merge_two_pages(rmap_item, page, 1480 kpage = try_to_merge_two_pages(rmap_item, page,
1226 tree_rmap_item, tree_page); 1481 tree_rmap_item, tree_page);
1227 put_page(tree_page); 1482 put_page(tree_page);
1228 /*
1229 * As soon as we merge this page, we want to remove the
1230 * rmap_item of the page we have merged with from the unstable
1231 * tree, and insert it instead as new node in the stable tree.
1232 */
1233 if (kpage) { 1483 if (kpage) {
1234 remove_rmap_item_from_tree(tree_rmap_item); 1484 /*
1235 1485 * The pages were successfully merged: insert new
1486 * node in the stable tree and add both rmap_items.
1487 */
1236 lock_page(kpage); 1488 lock_page(kpage);
1237 stable_node = stable_tree_insert(kpage); 1489 stable_node = stable_tree_insert(kpage);
1238 if (stable_node) { 1490 if (stable_node) {
@@ -1289,6 +1541,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1289 struct mm_slot *slot; 1541 struct mm_slot *slot;
1290 struct vm_area_struct *vma; 1542 struct vm_area_struct *vma;
1291 struct rmap_item *rmap_item; 1543 struct rmap_item *rmap_item;
1544 int nid;
1292 1545
1293 if (list_empty(&ksm_mm_head.mm_list)) 1546 if (list_empty(&ksm_mm_head.mm_list))
1294 return NULL; 1547 return NULL;
@@ -1307,7 +1560,29 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1307 */ 1560 */
1308 lru_add_drain_all(); 1561 lru_add_drain_all();
1309 1562
1310 root_unstable_tree = RB_ROOT; 1563 /*
1564 * Whereas stale stable_nodes on the stable_tree itself
1565 * get pruned in the regular course of stable_tree_search(),
1566 * those moved out to the migrate_nodes list can accumulate:
1567 * so prune them once before each full scan.
1568 */
1569 if (!ksm_merge_across_nodes) {
1570 struct stable_node *stable_node;
1571 struct list_head *this, *next;
1572 struct page *page;
1573
1574 list_for_each_safe(this, next, &migrate_nodes) {
1575 stable_node = list_entry(this,
1576 struct stable_node, list);
1577 page = get_ksm_page(stable_node, false);
1578 if (page)
1579 put_page(page);
1580 cond_resched();
1581 }
1582 }
1583
1584 for (nid = 0; nid < ksm_nr_node_ids; nid++)
1585 root_unstable_tree[nid] = RB_ROOT;
1311 1586
1312 spin_lock(&ksm_mmlist_lock); 1587 spin_lock(&ksm_mmlist_lock);
1313 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); 1588 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
@@ -1392,7 +1667,7 @@ next_mm:
1392 * or when all VM_MERGEABLE areas have been unmapped (and 1667 * or when all VM_MERGEABLE areas have been unmapped (and
1393 * mmap_sem then protects against race with MADV_MERGEABLE). 1668 * mmap_sem then protects against race with MADV_MERGEABLE).
1394 */ 1669 */
1395 hlist_del(&slot->link); 1670 hash_del(&slot->link);
1396 list_del(&slot->mm_list); 1671 list_del(&slot->mm_list);
1397 spin_unlock(&ksm_mmlist_lock); 1672 spin_unlock(&ksm_mmlist_lock);
1398 1673
@@ -1428,8 +1703,7 @@ static void ksm_do_scan(unsigned int scan_npages)
1428 rmap_item = scan_get_next_rmap_item(&page); 1703 rmap_item = scan_get_next_rmap_item(&page);
1429 if (!rmap_item) 1704 if (!rmap_item)
1430 return; 1705 return;
1431 if (!PageKsm(page) || !in_stable_tree(rmap_item)) 1706 cmp_and_merge_page(page, rmap_item);
1432 cmp_and_merge_page(page, rmap_item);
1433 put_page(page); 1707 put_page(page);
1434 } 1708 }
1435} 1709}
@@ -1446,6 +1720,7 @@ static int ksm_scan_thread(void *nothing)
1446 1720
1447 while (!kthread_should_stop()) { 1721 while (!kthread_should_stop()) {
1448 mutex_lock(&ksm_thread_mutex); 1722 mutex_lock(&ksm_thread_mutex);
1723 wait_while_offlining();
1449 if (ksmd_should_run()) 1724 if (ksmd_should_run())
1450 ksm_do_scan(ksm_thread_pages_to_scan); 1725 ksm_do_scan(ksm_thread_pages_to_scan);
1451 mutex_unlock(&ksm_thread_mutex); 1726 mutex_unlock(&ksm_thread_mutex);
@@ -1525,11 +1800,19 @@ int __ksm_enter(struct mm_struct *mm)
1525 spin_lock(&ksm_mmlist_lock); 1800 spin_lock(&ksm_mmlist_lock);
1526 insert_to_mm_slots_hash(mm, mm_slot); 1801 insert_to_mm_slots_hash(mm, mm_slot);
1527 /* 1802 /*
1528 * Insert just behind the scanning cursor, to let the area settle 1803 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
1804 * insert just behind the scanning cursor, to let the area settle
1529 * down a little; when fork is followed by immediate exec, we don't 1805 * down a little; when fork is followed by immediate exec, we don't
1530 * want ksmd to waste time setting up and tearing down an rmap_list. 1806 * want ksmd to waste time setting up and tearing down an rmap_list.
1807 *
1808 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
1809 * scanning cursor, otherwise KSM pages in newly forked mms will be
1810 * missed: then we might as well insert at the end of the list.
1531 */ 1811 */
1532 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); 1812 if (ksm_run & KSM_RUN_UNMERGE)
1813 list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
1814 else
1815 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
1533 spin_unlock(&ksm_mmlist_lock); 1816 spin_unlock(&ksm_mmlist_lock);
1534 1817
1535 set_bit(MMF_VM_MERGEABLE, &mm->flags); 1818 set_bit(MMF_VM_MERGEABLE, &mm->flags);
@@ -1559,7 +1842,7 @@ void __ksm_exit(struct mm_struct *mm)
1559 mm_slot = get_mm_slot(mm); 1842 mm_slot = get_mm_slot(mm);
1560 if (mm_slot && ksm_scan.mm_slot != mm_slot) { 1843 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
1561 if (!mm_slot->rmap_list) { 1844 if (!mm_slot->rmap_list) {
1562 hlist_del(&mm_slot->link); 1845 hash_del(&mm_slot->link);
1563 list_del(&mm_slot->mm_list); 1846 list_del(&mm_slot->mm_list);
1564 easy_to_free = 1; 1847 easy_to_free = 1;
1565 } else { 1848 } else {
@@ -1579,24 +1862,32 @@ void __ksm_exit(struct mm_struct *mm)
1579 } 1862 }
1580} 1863}
1581 1864
1582struct page *ksm_does_need_to_copy(struct page *page, 1865struct page *ksm_might_need_to_copy(struct page *page,
1583 struct vm_area_struct *vma, unsigned long address) 1866 struct vm_area_struct *vma, unsigned long address)
1584{ 1867{
1868 struct anon_vma *anon_vma = page_anon_vma(page);
1585 struct page *new_page; 1869 struct page *new_page;
1586 1870
1871 if (PageKsm(page)) {
1872 if (page_stable_node(page) &&
1873 !(ksm_run & KSM_RUN_UNMERGE))
1874 return page; /* no need to copy it */
1875 } else if (!anon_vma) {
1876 return page; /* no need to copy it */
1877 } else if (anon_vma->root == vma->anon_vma->root &&
1878 page->index == linear_page_index(vma, address)) {
1879 return page; /* still no need to copy it */
1880 }
1881 if (!PageUptodate(page))
1882 return page; /* let do_swap_page report the error */
1883
1587 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1884 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1588 if (new_page) { 1885 if (new_page) {
1589 copy_user_highpage(new_page, page, address, vma); 1886 copy_user_highpage(new_page, page, address, vma);
1590 1887
1591 SetPageDirty(new_page); 1888 SetPageDirty(new_page);
1592 __SetPageUptodate(new_page); 1889 __SetPageUptodate(new_page);
1593 SetPageSwapBacked(new_page);
1594 __set_page_locked(new_page); 1890 __set_page_locked(new_page);
1595
1596 if (!mlocked_vma_newpage(vma, new_page))
1597 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
1598 else
1599 add_page_to_unevictable_list(new_page);
1600 } 1891 }
1601 1892
1602 return new_page; 1893 return new_page;
@@ -1773,64 +2064,115 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage)
1773 if (stable_node) { 2064 if (stable_node) {
1774 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); 2065 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
1775 stable_node->kpfn = page_to_pfn(newpage); 2066 stable_node->kpfn = page_to_pfn(newpage);
2067 /*
2068 * newpage->mapping was set in advance; now we need smp_wmb()
2069 * to make sure that the new stable_node->kpfn is visible
2070 * to get_ksm_page() before it can see that oldpage->mapping
2071 * has gone stale (or that PageSwapCache has been cleared).
2072 */
2073 smp_wmb();
2074 set_page_stable_node(oldpage, NULL);
1776 } 2075 }
1777} 2076}
1778#endif /* CONFIG_MIGRATION */ 2077#endif /* CONFIG_MIGRATION */
1779 2078
1780#ifdef CONFIG_MEMORY_HOTREMOVE 2079#ifdef CONFIG_MEMORY_HOTREMOVE
1781static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, 2080static int just_wait(void *word)
1782 unsigned long end_pfn)
1783{ 2081{
1784 struct rb_node *node; 2082 schedule();
2083 return 0;
2084}
1785 2085
1786 for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { 2086static void wait_while_offlining(void)
1787 struct stable_node *stable_node; 2087{
2088 while (ksm_run & KSM_RUN_OFFLINE) {
2089 mutex_unlock(&ksm_thread_mutex);
2090 wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
2091 just_wait, TASK_UNINTERRUPTIBLE);
2092 mutex_lock(&ksm_thread_mutex);
2093 }
2094}
1788 2095
1789 stable_node = rb_entry(node, struct stable_node, node); 2096static void ksm_check_stable_tree(unsigned long start_pfn,
2097 unsigned long end_pfn)
2098{
2099 struct stable_node *stable_node;
2100 struct list_head *this, *next;
2101 struct rb_node *node;
2102 int nid;
2103
2104 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
2105 node = rb_first(root_stable_tree + nid);
2106 while (node) {
2107 stable_node = rb_entry(node, struct stable_node, node);
2108 if (stable_node->kpfn >= start_pfn &&
2109 stable_node->kpfn < end_pfn) {
2110 /*
2111 * Don't get_ksm_page, page has already gone:
2112 * which is why we keep kpfn instead of page*
2113 */
2114 remove_node_from_stable_tree(stable_node);
2115 node = rb_first(root_stable_tree + nid);
2116 } else
2117 node = rb_next(node);
2118 cond_resched();
2119 }
2120 }
2121 list_for_each_safe(this, next, &migrate_nodes) {
2122 stable_node = list_entry(this, struct stable_node, list);
1790 if (stable_node->kpfn >= start_pfn && 2123 if (stable_node->kpfn >= start_pfn &&
1791 stable_node->kpfn < end_pfn) 2124 stable_node->kpfn < end_pfn)
1792 return stable_node; 2125 remove_node_from_stable_tree(stable_node);
2126 cond_resched();
1793 } 2127 }
1794 return NULL;
1795} 2128}
1796 2129
1797static int ksm_memory_callback(struct notifier_block *self, 2130static int ksm_memory_callback(struct notifier_block *self,
1798 unsigned long action, void *arg) 2131 unsigned long action, void *arg)
1799{ 2132{
1800 struct memory_notify *mn = arg; 2133 struct memory_notify *mn = arg;
1801 struct stable_node *stable_node;
1802 2134
1803 switch (action) { 2135 switch (action) {
1804 case MEM_GOING_OFFLINE: 2136 case MEM_GOING_OFFLINE:
1805 /* 2137 /*
1806 * Keep it very simple for now: just lock out ksmd and 2138 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
1807 * MADV_UNMERGEABLE while any memory is going offline. 2139 * and remove_all_stable_nodes() while memory is going offline:
1808 * mutex_lock_nested() is necessary because lockdep was alarmed 2140 * it is unsafe for them to touch the stable tree at this time.
1809 * that here we take ksm_thread_mutex inside notifier chain 2141 * But unmerge_ksm_pages(), rmap lookups and other entry points
1810 * mutex, and later take notifier chain mutex inside 2142 * which do not need the ksm_thread_mutex are all safe.
1811 * ksm_thread_mutex to unlock it. But that's safe because both
1812 * are inside mem_hotplug_mutex.
1813 */ 2143 */
1814 mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); 2144 mutex_lock(&ksm_thread_mutex);
2145 ksm_run |= KSM_RUN_OFFLINE;
2146 mutex_unlock(&ksm_thread_mutex);
1815 break; 2147 break;
1816 2148
1817 case MEM_OFFLINE: 2149 case MEM_OFFLINE:
1818 /* 2150 /*
1819 * Most of the work is done by page migration; but there might 2151 * Most of the work is done by page migration; but there might
1820 * be a few stable_nodes left over, still pointing to struct 2152 * be a few stable_nodes left over, still pointing to struct
1821 * pages which have been offlined: prune those from the tree. 2153 * pages which have been offlined: prune those from the tree,
2154 * otherwise get_ksm_page() might later try to access a
2155 * non-existent struct page.
1822 */ 2156 */
1823 while ((stable_node = ksm_check_stable_tree(mn->start_pfn, 2157 ksm_check_stable_tree(mn->start_pfn,
1824 mn->start_pfn + mn->nr_pages)) != NULL) 2158 mn->start_pfn + mn->nr_pages);
1825 remove_node_from_stable_tree(stable_node);
1826 /* fallthrough */ 2159 /* fallthrough */
1827 2160
1828 case MEM_CANCEL_OFFLINE: 2161 case MEM_CANCEL_OFFLINE:
2162 mutex_lock(&ksm_thread_mutex);
2163 ksm_run &= ~KSM_RUN_OFFLINE;
1829 mutex_unlock(&ksm_thread_mutex); 2164 mutex_unlock(&ksm_thread_mutex);
2165
2166 smp_mb(); /* wake_up_bit advises this */
2167 wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
1830 break; 2168 break;
1831 } 2169 }
1832 return NOTIFY_OK; 2170 return NOTIFY_OK;
1833} 2171}
2172#else
2173static void wait_while_offlining(void)
2174{
2175}
1834#endif /* CONFIG_MEMORY_HOTREMOVE */ 2176#endif /* CONFIG_MEMORY_HOTREMOVE */
1835 2177
1836#ifdef CONFIG_SYSFS 2178#ifdef CONFIG_SYSFS
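The new KSM_RUN_OFFLINE bit above turns memory hot-remove into a lightweight gate: the notifier sets the bit under ksm_thread_mutex, and ksmd (plus the sysfs writers) drop the mutex and sleep on that bit via wait_on_bit() until wake_up_bit() fires after MEM_OFFLINE or MEM_CANCEL_OFFLINE. A pthread sketch of the same gate, with a condition variable standing in for the bit-wait API (all names here are illustrative):

#include <pthread.h>

static pthread_mutex_t demo_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  demo_cond  = PTHREAD_COND_INITIALIZER;
static unsigned long   demo_run;
#define DEMO_RUN_OFFLINE 4UL

/* Called with demo_mutex held, like wait_while_offlining(). */
static void demo_wait_while_offlining(void)
{
        while (demo_run & DEMO_RUN_OFFLINE)
                pthread_cond_wait(&demo_cond, &demo_mutex); /* drops the lock while asleep */
}

static void demo_going_offline(void)            /* MEM_GOING_OFFLINE */
{
        pthread_mutex_lock(&demo_mutex);
        demo_run |= DEMO_RUN_OFFLINE;
        pthread_mutex_unlock(&demo_mutex);
}

static void demo_offline_done(void)             /* MEM_OFFLINE / MEM_CANCEL_OFFLINE */
{
        pthread_mutex_lock(&demo_mutex);
        demo_run &= ~DEMO_RUN_OFFLINE;
        pthread_cond_broadcast(&demo_cond);     /* the wake_up_bit() moment */
        pthread_mutex_unlock(&demo_mutex);
}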
@@ -1893,7 +2235,7 @@ KSM_ATTR(pages_to_scan);
1893static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, 2235static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
1894 char *buf) 2236 char *buf)
1895{ 2237{
1896 return sprintf(buf, "%u\n", ksm_run); 2238 return sprintf(buf, "%lu\n", ksm_run);
1897} 2239}
1898 2240
1899static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, 2241static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1916,6 +2258,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1916 */ 2258 */
1917 2259
1918 mutex_lock(&ksm_thread_mutex); 2260 mutex_lock(&ksm_thread_mutex);
2261 wait_while_offlining();
1919 if (ksm_run != flags) { 2262 if (ksm_run != flags) {
1920 ksm_run = flags; 2263 ksm_run = flags;
1921 if (flags & KSM_RUN_UNMERGE) { 2264 if (flags & KSM_RUN_UNMERGE) {
@@ -1937,6 +2280,64 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1937} 2280}
1938KSM_ATTR(run); 2281KSM_ATTR(run);
1939 2282
2283#ifdef CONFIG_NUMA
2284static ssize_t merge_across_nodes_show(struct kobject *kobj,
2285 struct kobj_attribute *attr, char *buf)
2286{
2287 return sprintf(buf, "%u\n", ksm_merge_across_nodes);
2288}
2289
2290static ssize_t merge_across_nodes_store(struct kobject *kobj,
2291 struct kobj_attribute *attr,
2292 const char *buf, size_t count)
2293{
2294 int err;
2295 unsigned long knob;
2296
2297 err = kstrtoul(buf, 10, &knob);
2298 if (err)
2299 return err;
2300 if (knob > 1)
2301 return -EINVAL;
2302
2303 mutex_lock(&ksm_thread_mutex);
2304 wait_while_offlining();
2305 if (ksm_merge_across_nodes != knob) {
2306 if (ksm_pages_shared || remove_all_stable_nodes())
2307 err = -EBUSY;
2308 else if (root_stable_tree == one_stable_tree) {
2309 struct rb_root *buf;
2310 /*
2311 * This is the first time that we switch away from the
2312 * default of merging across nodes: must now allocate
2313 * a buffer to hold as many roots as may be needed.
2314 * Allocate stable and unstable together:
2315 * MAXSMP NODES_SHIFT 10 will use 16kB.
2316 */
2317 buf = kcalloc(nr_node_ids + nr_node_ids,
2318 sizeof(*buf), GFP_KERNEL | __GFP_ZERO);
2319 /* Let us assume that RB_ROOT is NULL is zero */
2320 if (!buf)
2321 err = -ENOMEM;
2322 else {
2323 root_stable_tree = buf;
2324 root_unstable_tree = buf + nr_node_ids;
2325 /* Stable tree is empty but not the unstable */
2326 root_unstable_tree[0] = one_unstable_tree[0];
2327 }
2328 }
2329 if (!err) {
2330 ksm_merge_across_nodes = knob;
2331 ksm_nr_node_ids = knob ? 1 : nr_node_ids;
2332 }
2333 }
2334 mutex_unlock(&ksm_thread_mutex);
2335
2336 return err ? err : count;
2337}
2338KSM_ATTR(merge_across_nodes);
2339#endif
2340
1940static ssize_t pages_shared_show(struct kobject *kobj, 2341static ssize_t pages_shared_show(struct kobject *kobj,
1941 struct kobj_attribute *attr, char *buf) 2342 struct kobj_attribute *attr, char *buf)
1942{ 2343{
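merge_across_nodes_store() above defers allocating the per-node tree roots until the first time the knob is actually switched off: the static one-element arrays serve the common case, and a single kcalloc() then provides nr_node_ids stable roots followed by nr_node_ids unstable roots, with zeroed memory doubling as empty RB_ROOTs. A userspace sketch of that lazy switch (struct root, nr_nodes and the function name are stand-ins):

#include <stdlib.h>

struct root { void *rb_node; };         /* stands in for struct rb_root */

static struct root one_stable[1], one_unstable[1];
static struct root *stable_roots   = one_stable;
static struct root *unstable_roots = one_unstable;

static int switch_to_per_node_trees(int nr_nodes)
{
        struct root *buf;

        if (stable_roots != one_stable)
                return 0;                               /* already switched */

        buf = calloc(2 * (size_t)nr_nodes, sizeof(*buf));  /* zeroed == empty trees */
        if (!buf)
                return -1;

        stable_roots   = buf;
        unstable_roots = buf + nr_nodes;
        /* Stable tree is empty, but carry over the current unstable tree. */
        unstable_roots[0] = one_unstable[0];
        return 0;
}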
@@ -1991,6 +2392,9 @@ static struct attribute *ksm_attrs[] = {
1991 &pages_unshared_attr.attr, 2392 &pages_unshared_attr.attr,
1992 &pages_volatile_attr.attr, 2393 &pages_volatile_attr.attr,
1993 &full_scans_attr.attr, 2394 &full_scans_attr.attr,
2395#ifdef CONFIG_NUMA
2396 &merge_across_nodes_attr.attr,
2397#endif
1994 NULL, 2398 NULL,
1995}; 2399};
1996 2400
@@ -2029,10 +2433,7 @@ static int __init ksm_init(void)
2029#endif /* CONFIG_SYSFS */ 2433#endif /* CONFIG_SYSFS */
2030 2434
2031#ifdef CONFIG_MEMORY_HOTREMOVE 2435#ifdef CONFIG_MEMORY_HOTREMOVE
2032 /* 2436 /* There is no significance to this priority 100 */
2033 * Choose a high priority since the callback takes ksm_thread_mutex:
2034 * later callbacks could only be taking locks which nest within that.
2035 */
2036 hotplug_memory_notifier(ksm_memory_callback, 100); 2437 hotplug_memory_notifier(ksm_memory_callback, 100);
2037#endif 2438#endif
2038 return 0; 2439 return 0;