aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Hugh Dickins <hugh.dickins@tiscali.co.uk> 2009-12-14 20:59:33 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2009-12-15 11:53:20 -0500
commit 62b61f611eb5e20f7e9f8619bfd03bdfe8af6348 (patch)
tree 9f06fff7eb6530fbe90b4d998b91071133f6af25
parent e9995ef978a7d5296fe04a9a2c5ca6e66d8bb4e5 (diff)
ksm: memory hotremove migration only
The previous patch enables page migration of ksm pages, but that soon gets into trouble: not surprising, since we're using the ksm page lock to lock operations on its stable_node, but page migration switches the page whose lock is to be used for that. Another layer of locking would fix it, but do we need that yet?

Do we actually need page migration of ksm pages? Yes, memory hotremove needs to offline sections of memory: and since we stopped allocating ksm pages with GFP_HIGHUSER, they will tend to be GFP_HIGHUSER_MOVABLE candidates for migration.

But KSM is currently unconscious of NUMA issues, happily merging pages from different NUMA nodes: at present the rule must be, not to use MADV_MERGEABLE where you care about NUMA. So no, NUMA page migration of ksm pages does not make sense yet.

So, to complete support for ksm swapping we need to make hotremove safe. ksm_memory_callback() takes ksm_thread_mutex when MEM_GOING_OFFLINE and releases it when MEM_OFFLINE or MEM_CANCEL_OFFLINE. But if mapped pages are freed before migration reaches them, stable_nodes may be left still pointing to struct pages which have been removed from the system: the stable_node needs to identify a page by pfn rather than page pointer, then it can safely prune them when MEM_OFFLINE.

And make NUMA migration skip PageKsm pages where it skips PageReserved. But it's only when we reach unmap_and_move() that the page lock is taken and we can be sure that raised pagecount has prevented a PageAnon from being upgraded: so add offlining arg to migrate_pages(), to migrate a ksm page when offlining (has sufficient locking) but reject it otherwise.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Chris Wright <chrisw@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- include/linux/migrate.h 8
-rw-r--r-- mm/ksm.c 84
-rw-r--r-- mm/memory_hotplug.c 2
-rw-r--r-- mm/mempolicy.c 19
-rw-r--r-- mm/migrate.c 27
5 files changed, 103 insertions, 37 deletions
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 527602cdea1c..7f085c97c799 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -12,7 +12,8 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **);
12extern int putback_lru_pages(struct list_head *l); 12extern int putback_lru_pages(struct list_head *l);
13extern int migrate_page(struct address_space *, 13extern int migrate_page(struct address_space *,
14 struct page *, struct page *); 14 struct page *, struct page *);
15extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long); 15extern int migrate_pages(struct list_head *l, new_page_t x,
16 unsigned long private, int offlining);
16 17
17extern int fail_migrate_page(struct address_space *, 18extern int fail_migrate_page(struct address_space *,
18 struct page *, struct page *); 19 struct page *, struct page *);
@@ -26,10 +27,7 @@ extern int migrate_vmas(struct mm_struct *mm,
26 27
27static inline int putback_lru_pages(struct list_head *l) { return 0; } 28static inline int putback_lru_pages(struct list_head *l) { return 0; }
28static inline int migrate_pages(struct list_head *l, new_page_t x, 29static inline int migrate_pages(struct list_head *l, new_page_t x,
29 unsigned long private) { return -ENOSYS; } 30 unsigned long private, int offlining) { return -ENOSYS; }
30
31static inline int migrate_pages_to(struct list_head *pagelist,
32 struct vm_area_struct *vma, int dest) { return 0; }
33 31
34static inline int migrate_prep(void) { return -ENOSYS; } 32static inline int migrate_prep(void) { return -ENOSYS; }
35 33
diff --git a/mm/ksm.c b/mm/ksm.c
index dfdc292d3626..d4c228a9d278 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -29,6 +29,7 @@
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/rbtree.h> 31#include <linux/rbtree.h>
32#include <linux/memory.h>
32#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
33#include <linux/swap.h> 34#include <linux/swap.h>
34#include <linux/ksm.h> 35#include <linux/ksm.h>
@@ -108,14 +109,14 @@ struct ksm_scan {
108 109
109/** 110/**
110 * struct stable_node - node of the stable rbtree 111 * struct stable_node - node of the stable rbtree
111 * @page: pointer to struct page of the ksm page
112 * @node: rb node of this ksm page in the stable tree 112 * @node: rb node of this ksm page in the stable tree
113 * @hlist: hlist head of rmap_items using this ksm page 113 * @hlist: hlist head of rmap_items using this ksm page
114 * @kpfn: page frame number of this ksm page
114 */ 115 */
115struct stable_node { 116struct stable_node {
116 struct page *page;
117 struct rb_node node; 117 struct rb_node node;
118 struct hlist_head hlist; 118 struct hlist_head hlist;
119 unsigned long kpfn;
119}; 120};
120 121
121/** 122/**
@@ -515,7 +516,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node)
515 struct page *page; 516 struct page *page;
516 void *expected_mapping; 517 void *expected_mapping;
517 518
518 page = stable_node->page; 519 page = pfn_to_page(stable_node->kpfn);
519 expected_mapping = (void *)stable_node + 520 expected_mapping = (void *)stable_node +
520 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 521 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
521 rcu_read_lock(); 522 rcu_read_lock();
@@ -973,7 +974,7 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
973 * This function returns the stable tree node of identical content if found, 974 * This function returns the stable tree node of identical content if found,
974 * NULL otherwise. 975 * NULL otherwise.
975 */ 976 */
976static struct stable_node *stable_tree_search(struct page *page) 977static struct page *stable_tree_search(struct page *page)
977{ 978{
978 struct rb_node *node = root_stable_tree.rb_node; 979 struct rb_node *node = root_stable_tree.rb_node;
979 struct stable_node *stable_node; 980 struct stable_node *stable_node;
@@ -981,7 +982,7 @@ static struct stable_node *stable_tree_search(struct page *page)
981 stable_node = page_stable_node(page); 982 stable_node = page_stable_node(page);
982 if (stable_node) { /* ksm page forked */ 983 if (stable_node) { /* ksm page forked */
983 get_page(page); 984 get_page(page);
984 return stable_node; 985 return page;
985 } 986 }
986 987
987 while (node) { 988 while (node) {
@@ -1003,7 +1004,7 @@ static struct stable_node *stable_tree_search(struct page *page)
1003 put_page(tree_page); 1004 put_page(tree_page);
1004 node = node->rb_right; 1005 node = node->rb_right;
1005 } else 1006 } else
1006 return stable_node; 1007 return tree_page;
1007 } 1008 }
1008 1009
1009 return NULL; 1010 return NULL;
@@ -1059,7 +1060,7 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
1059 1060
1060 INIT_HLIST_HEAD(&stable_node->hlist); 1061 INIT_HLIST_HEAD(&stable_node->hlist);
1061 1062
1062 stable_node->page = kpage; 1063 stable_node->kpfn = page_to_pfn(kpage);
1063 set_page_stable_node(kpage, stable_node); 1064 set_page_stable_node(kpage, stable_node);
1064 1065
1065 return stable_node; 1066 return stable_node;
@@ -1170,9 +1171,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1170 remove_rmap_item_from_tree(rmap_item); 1171 remove_rmap_item_from_tree(rmap_item);
1171 1172
1172 /* We first start with searching the page inside the stable tree */ 1173 /* We first start with searching the page inside the stable tree */
1173 stable_node = stable_tree_search(page); 1174 kpage = stable_tree_search(page);
1174 if (stable_node) { 1175 if (kpage) {
1175 kpage = stable_node->page;
1176 err = try_to_merge_with_ksm_page(rmap_item, page, kpage); 1176 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
1177 if (!err) { 1177 if (!err) {
1178 /* 1178 /*
@@ -1180,7 +1180,7 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1180 * add its rmap_item to the stable tree. 1180 * add its rmap_item to the stable tree.
1181 */ 1181 */
1182 lock_page(kpage); 1182 lock_page(kpage);
1183 stable_tree_append(rmap_item, stable_node); 1183 stable_tree_append(rmap_item, page_stable_node(kpage));
1184 unlock_page(kpage); 1184 unlock_page(kpage);
1185 } 1185 }
1186 put_page(kpage); 1186 put_page(kpage);
@@ -1715,12 +1715,63 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage)
1715 1715
1716 stable_node = page_stable_node(newpage); 1716 stable_node = page_stable_node(newpage);
1717 if (stable_node) { 1717 if (stable_node) {
1718 VM_BUG_ON(stable_node->page != oldpage); 1718 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
1719 stable_node->page = newpage; 1719 stable_node->kpfn = page_to_pfn(newpage);
1720 } 1720 }
1721} 1721}
1722#endif /* CONFIG_MIGRATION */ 1722#endif /* CONFIG_MIGRATION */
1723 1723
1724#ifdef CONFIG_MEMORY_HOTREMOVE
1725static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn,
1726 unsigned long end_pfn)
1727{
1728 struct rb_node *node;
1729
1730 for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
1731 struct stable_node *stable_node;
1732
1733 stable_node = rb_entry(node, struct stable_node, node);
1734 if (stable_node->kpfn >= start_pfn &&
1735 stable_node->kpfn < end_pfn)
1736 return stable_node;
1737 }
1738 return NULL;
1739}
1740
1741static int ksm_memory_callback(struct notifier_block *self,
1742 unsigned long action, void *arg)
1743{
1744 struct memory_notify *mn = arg;
1745 struct stable_node *stable_node;
1746
1747 switch (action) {
1748 case MEM_GOING_OFFLINE:
1749 /*
1750 * Keep it very simple for now: just lock out ksmd and
1751 * MADV_UNMERGEABLE while any memory is going offline.
1752 */
1753 mutex_lock(&ksm_thread_mutex);
1754 break;
1755
1756 case MEM_OFFLINE:
1757 /*
1758 * Most of the work is done by page migration; but there might
1759 * be a few stable_nodes left over, still pointing to struct
1760 * pages which have been offlined: prune those from the tree.
1761 */
1762 while ((stable_node = ksm_check_stable_tree(mn->start_pfn,
1763 mn->start_pfn + mn->nr_pages)) != NULL)
1764 remove_node_from_stable_tree(stable_node);
1765 /* fallthrough */
1766
1767 case MEM_CANCEL_OFFLINE:
1768 mutex_unlock(&ksm_thread_mutex);
1769 break;
1770 }
1771 return NOTIFY_OK;
1772}
1773#endif /* CONFIG_MEMORY_HOTREMOVE */
1774
1724#ifdef CONFIG_SYSFS 1775#ifdef CONFIG_SYSFS
1725/* 1776/*
1726 * This all compiles without CONFIG_SYSFS, but is a waste of space. 1777 * This all compiles without CONFIG_SYSFS, but is a waste of space.
@@ -1946,6 +1997,13 @@ static int __init ksm_init(void)
1946 1997
1947#endif /* CONFIG_SYSFS */ 1998#endif /* CONFIG_SYSFS */
1948 1999
2000#ifdef CONFIG_MEMORY_HOTREMOVE
2001 /*
2002 * Choose a high priority since the callback takes ksm_thread_mutex:
2003 * later callbacks could only be taking locks which nest within that.
2004 */
2005 hotplug_memory_notifier(ksm_memory_callback, 100);
2006#endif
1949 return 0; 2007 return 0;
1950 2008
1951out_free2: 2009out_free2:
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index bc5a08138f1e..67e941d7882c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -698,7 +698,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
698 if (list_empty(&source)) 698 if (list_empty(&source))
699 goto out; 699 goto out;
700 /* this function returns # of failed pages */ 700 /* this function returns # of failed pages */
701 ret = migrate_pages(&source, hotremove_migrate_alloc, 0); 701 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
702 702
703out: 703out:
704 return ret; 704 return ret;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f11fdad06204..290fb5bf0440 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -85,6 +85,7 @@
85#include <linux/seq_file.h> 85#include <linux/seq_file.h>
86#include <linux/proc_fs.h> 86#include <linux/proc_fs.h>
87#include <linux/migrate.h> 87#include <linux/migrate.h>
88#include <linux/ksm.h>
88#include <linux/rmap.h> 89#include <linux/rmap.h>
89#include <linux/security.h> 90#include <linux/security.h>
90#include <linux/syscalls.h> 91#include <linux/syscalls.h>
@@ -413,17 +414,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
413 if (!page) 414 if (!page)
414 continue; 415 continue;
415 /* 416 /*
416 * The check for PageReserved here is important to avoid 417 * vm_normal_page() filters out zero pages, but there might
417 * handling zero pages and other pages that may have been 418 * still be PageReserved pages to skip, perhaps in a VDSO.
418 * marked special by the system. 419 * And we cannot move PageKsm pages sensibly or safely yet.
419 *
420 * If the PageReserved would not be checked here then f.e.
421 * the location of the zero page could have an influence
422 * on MPOL_MF_STRICT, zero pages would be counted for
423 * the per node stats, and there would be useless attempts
424 * to put zero pages on the migration list.
425 */ 420 */
426 if (PageReserved(page)) 421 if (PageReserved(page) || PageKsm(page))
427 continue; 422 continue;
428 nid = page_to_nid(page); 423 nid = page_to_nid(page);
429 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 424 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -839,7 +834,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
839 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 834 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
840 835
841 if (!list_empty(&pagelist)) 836 if (!list_empty(&pagelist))
842 err = migrate_pages(&pagelist, new_node_page, dest); 837 err = migrate_pages(&pagelist, new_node_page, dest, 0);
843 838
844 return err; 839 return err;
845} 840}
@@ -1056,7 +1051,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1056 1051
1057 if (!list_empty(&pagelist)) 1052 if (!list_empty(&pagelist))
1058 nr_failed = migrate_pages(&pagelist, new_vma_page, 1053 nr_failed = migrate_pages(&pagelist, new_vma_page,
1059 (unsigned long)vma); 1054 (unsigned long)vma, 0);
1060 1055
1061 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1056 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1062 err = -EIO; 1057 err = -EIO;
diff --git a/mm/migrate.c b/mm/migrate.c
index 0b714747c028..2a0ea3ef509e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -543,7 +543,7 @@ static int move_to_new_page(struct page *newpage, struct page *page)
543 * to the newly allocated page in newpage. 543 * to the newly allocated page in newpage.
544 */ 544 */
545static int unmap_and_move(new_page_t get_new_page, unsigned long private, 545static int unmap_and_move(new_page_t get_new_page, unsigned long private,
546 struct page *page, int force) 546 struct page *page, int force, int offlining)
547{ 547{
548 int rc = 0; 548 int rc = 0;
549 int *result = NULL; 549 int *result = NULL;
@@ -569,6 +569,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
569 lock_page(page); 569 lock_page(page);
570 } 570 }
571 571
572 /*
573 * Only memory hotplug's offline_pages() caller has locked out KSM,
574 * and can safely migrate a KSM page. The other cases have skipped
575 * PageKsm along with PageReserved - but it is only now when we have
576 * the page lock that we can be certain it will not go KSM beneath us
577 * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
578 * its pagecount raised, but only here do we take the page lock which
579 * serializes that).
580 */
581 if (PageKsm(page) && !offlining) {
582 rc = -EBUSY;
583 goto unlock;
584 }
585
572 /* charge against new page */ 586 /* charge against new page */
573 charge = mem_cgroup_prepare_migration(page, &mem); 587 charge = mem_cgroup_prepare_migration(page, &mem);
574 if (charge == -ENOMEM) { 588 if (charge == -ENOMEM) {
@@ -685,7 +699,7 @@ move_newpage:
685 * Return: Number of pages not migrated or error code. 699 * Return: Number of pages not migrated or error code.
686 */ 700 */
687int migrate_pages(struct list_head *from, 701int migrate_pages(struct list_head *from,
688 new_page_t get_new_page, unsigned long private) 702 new_page_t get_new_page, unsigned long private, int offlining)
689{ 703{
690 int retry = 1; 704 int retry = 1;
691 int nr_failed = 0; 705 int nr_failed = 0;
@@ -705,7 +719,7 @@ int migrate_pages(struct list_head *from,
705 cond_resched(); 719 cond_resched();
706 720
707 rc = unmap_and_move(get_new_page, private, 721 rc = unmap_and_move(get_new_page, private,
708 page, pass > 2); 722 page, pass > 2, offlining);
709 723
710 switch(rc) { 724 switch(rc) {
711 case -ENOMEM: 725 case -ENOMEM:
@@ -801,7 +815,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
801 if (!page) 815 if (!page)
802 goto set_status; 816 goto set_status;
803 817
804 if (PageReserved(page)) /* Check for zero page */ 818 /* Use PageReserved to check for zero page */
819 if (PageReserved(page) || PageKsm(page))
805 goto put_and_set; 820 goto put_and_set;
806 821
807 pp->page = page; 822 pp->page = page;
@@ -838,7 +853,7 @@ set_status:
838 err = 0; 853 err = 0;
839 if (!list_empty(&pagelist)) 854 if (!list_empty(&pagelist))
840 err = migrate_pages(&pagelist, new_page_node, 855 err = migrate_pages(&pagelist, new_page_node,
841 (unsigned long)pm); 856 (unsigned long)pm, 0);
842 857
843 up_read(&mm->mmap_sem); 858 up_read(&mm->mmap_sem);
844 return err; 859 return err;
@@ -959,7 +974,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
959 974
960 err = -ENOENT; 975 err = -ENOENT;
961 /* Use PageReserved to check for zero page */ 976 /* Use PageReserved to check for zero page */
962 if (!page || PageReserved(page)) 977 if (!page || PageReserved(page) || PageKsm(page))
963 goto set_status; 978 goto set_status;
964 979
965 err = page_to_nid(page); 980 err = page_to_nid(page);