commit    62b61f611eb5e20f7e9f8619bfd03bdfe8af6348
tree      9f06fff7eb6530fbe90b4d998b91071133f6af25
parent    e9995ef978a7d5296fe04a9a2c5ca6e66d8bb4e5
author    Hugh Dickins <hugh.dickins@tiscali.co.uk>          2009-12-14 20:59:33 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>     2009-12-15 11:53:20 -0500
ksm: memory hotremove migration only
The previous patch enables page migration of ksm pages, but that soon gets
into trouble: not surprising, since we're using the ksm page lock to lock
operations on its stable_node, but page migration switches the page whose
lock is to be used for that. Another layer of locking would fix it, but
do we need that yet?
Do we actually need page migration of ksm pages? Yes, memory hotremove
needs to offline sections of memory: and since we stopped allocating ksm
pages with GFP_HIGHUSER, they will tend to be GFP_HIGHUSER_MOVABLE
candidates for migration.
But KSM is currently unconscious of NUMA issues, happily merging pages
from different NUMA nodes: at present the rule must be, not to use
MADV_MERGEABLE where you care about NUMA. So no, NUMA page migration of
ksm pages does not make sense yet.
So, to complete support for ksm swapping we need to make hotremove safe.
ksm_memory_callback() takes ksm_thread_mutex when MEM_GOING_OFFLINE and
releases it when MEM_OFFLINE or MEM_CANCEL_OFFLINE.  But if mapped pages
are freed before migration reaches them, stable_nodes may be left still
pointing to struct pages which have been removed from the system: the
stable_node needs to identify a page by pfn rather than page pointer, then
it can safely prune them when MEM_OFFLINE.
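(For orientation, the memory-hotplug notifier pattern relied on here looks roughly like the sketch below. It is illustrative only, built on the generic <linux/memory.h> API; the real callback added by this patch is in the mm/ksm.c hunk further down, and names such as my_memory_callback(), my_lock and my_prune_range() are placeholders, not part of the patch.)

/*
 * Illustrative sketch only (not from this patch): the generic memory-hotplug
 * notifier pattern that ksm_memory_callback() below follows.  my_lock and
 * my_prune_range() stand in for whatever state a subsystem must protect
 * across an offline operation.
 */
#include <linux/init.h>
#include <linux/memory.h>
#include <linux/mutex.h>
#include <linux/notifier.h>

static DEFINE_MUTEX(my_lock);

static void my_prune_range(unsigned long start_pfn, unsigned long end_pfn)
{
        /* placeholder: drop any references to pfns in [start_pfn, end_pfn) */
}

static int my_memory_callback(struct notifier_block *self,
                              unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;

        switch (action) {
        case MEM_GOING_OFFLINE:
                /* lock out other users while pages in the range are migrated */
                mutex_lock(&my_lock);
                break;
        case MEM_OFFLINE:
                /* migration is done; prune anything still pointing into the range */
                my_prune_range(mn->start_pfn, mn->start_pfn + mn->nr_pages);
                /* fallthrough */
        case MEM_CANCEL_OFFLINE:
                mutex_unlock(&my_lock);
                break;
        }
        return NOTIFY_OK;
}

static int __init my_init(void)
{
        /* high priority: later notifiers may only take locks nesting inside my_lock */
        hotplug_memory_notifier(my_memory_callback, 100);
        return 0;
}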
And make NUMA migration skip PageKsm pages where it skips PageReserved.
But it's only when we reach unmap_and_move() that the page lock is taken
and we can be sure that the raised page count has prevented a PageAnon page
from being upgraded to PageKsm: so add an offlining arg to migrate_pages(),
to migrate a ksm page when offlining (which has sufficient locking) but
reject it otherwise.
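(A hedged caller-side sketch of the resulting migrate_pages() contract follows; alloc_target_page() and migrate_list() are placeholders for illustration, not code from this patch. The real call sites are updated in the hunks below.)

/*
 * Illustrative only: how the new offlining argument is meant to be used.
 */
#include <linux/list.h>
#include <linux/migrate.h>

static struct page *alloc_target_page(struct page *page, unsigned long private,
                                      int **result)
{
        return NULL;    /* placeholder: a real new_page_t allocator returns the new page */
}

static int migrate_list(struct list_head *pagelist, int offlining)
{
        /*
         * offlining == 1: only valid from memory hot-remove (offline_pages()),
         * where MEM_GOING_OFFLINE has already taken ksm_thread_mutex, so KSM
         * pages may be migrated as well.
         * offlining == 0: NUMA policy/syscall migration; unmap_and_move()
         * rejects PageKsm pages with -EBUSY once it holds the page lock.
         */
        return migrate_pages(pagelist, alloc_target_page, 0, offlining);
}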
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Chris Wright <chrisw@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  include/linux/migrate.h |  8
-rw-r--r--  mm/ksm.c                | 84
-rw-r--r--  mm/memory_hotplug.c     |  2
-rw-r--r--  mm/mempolicy.c          | 19
-rw-r--r--  mm/migrate.c            | 27
5 files changed, 103 insertions(+), 37 deletions(-)
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 527602cdea1..7f085c97c79 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -12,7 +12,8 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **);
 extern int putback_lru_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
                        struct page *, struct page *);
-extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long);
+extern int migrate_pages(struct list_head *l, new_page_t x,
+                       unsigned long private, int offlining);
 
 extern int fail_migrate_page(struct address_space *,
                        struct page *, struct page *);
@@ -26,10 +27,7 @@ extern int migrate_vmas(struct mm_struct *mm,
 
 static inline int putback_lru_pages(struct list_head *l) { return 0; }
 static inline int migrate_pages(struct list_head *l, new_page_t x,
-               unsigned long private) { return -ENOSYS; }
-
-static inline int migrate_pages_to(struct list_head *pagelist,
-                       struct vm_area_struct *vma, int dest) { return 0; }
+               unsigned long private, int offlining) { return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
 
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -29,6 +29,7 @@
 #include <linux/wait.h>
 #include <linux/slab.h>
 #include <linux/rbtree.h>
+#include <linux/memory.h>
 #include <linux/mmu_notifier.h>
 #include <linux/swap.h>
 #include <linux/ksm.h>
@@ -108,14 +109,14 @@ struct ksm_scan {
 
 /**
  * struct stable_node - node of the stable rbtree
- * @page: pointer to struct page of the ksm page
  * @node: rb node of this ksm page in the stable tree
  * @hlist: hlist head of rmap_items using this ksm page
+ * @kpfn: page frame number of this ksm page
  */
 struct stable_node {
-       struct page *page;
        struct rb_node node;
        struct hlist_head hlist;
+       unsigned long kpfn;
 };
 
 /**
@@ -515,7 +516,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node)
        struct page *page;
        void *expected_mapping;
 
-       page = stable_node->page;
+       page = pfn_to_page(stable_node->kpfn);
        expected_mapping = (void *)stable_node +
                                (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
        rcu_read_lock();
@@ -973,7 +974,7 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
  * This function returns the stable tree node of identical content if found,
  * NULL otherwise.
  */
-static struct stable_node *stable_tree_search(struct page *page)
+static struct page *stable_tree_search(struct page *page)
 {
        struct rb_node *node = root_stable_tree.rb_node;
        struct stable_node *stable_node;
@@ -981,7 +982,7 @@ static struct stable_node *stable_tree_search(struct page *page)
        stable_node = page_stable_node(page);
        if (stable_node) {              /* ksm page forked */
                get_page(page);
-               return stable_node;
+               return page;
        }
 
        while (node) {
@@ -1003,7 +1004,7 @@ static struct stable_node *stable_tree_search(struct page *page)
                        put_page(tree_page);
                        node = node->rb_right;
                } else
-                       return stable_node;
+                       return tree_page;
        }
 
        return NULL;
@@ -1059,7 +1060,7 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
 
        INIT_HLIST_HEAD(&stable_node->hlist);
 
-       stable_node->page = kpage;
+       stable_node->kpfn = page_to_pfn(kpage);
        set_page_stable_node(kpage, stable_node);
 
        return stable_node;
@@ -1170,9 +1171,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
        remove_rmap_item_from_tree(rmap_item);
 
        /* We first start with searching the page inside the stable tree */
-       stable_node = stable_tree_search(page);
-       if (stable_node) {
-               kpage = stable_node->page;
+       kpage = stable_tree_search(page);
+       if (kpage) {
                err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
                if (!err) {
                        /*
@@ -1180,7 +1180,7 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
                         * add its rmap_item to the stable tree.
                         */
                        lock_page(kpage);
-                       stable_tree_append(rmap_item, stable_node);
+                       stable_tree_append(rmap_item, page_stable_node(kpage));
                        unlock_page(kpage);
                }
                put_page(kpage);
@@ -1715,12 +1715,63 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage)
 
        stable_node = page_stable_node(newpage);
        if (stable_node) {
-               VM_BUG_ON(stable_node->page != oldpage);
-               stable_node->page = newpage;
+               VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
+               stable_node->kpfn = page_to_pfn(newpage);
        }
 }
 #endif /* CONFIG_MIGRATION */
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn,
+                                                unsigned long end_pfn)
+{
+       struct rb_node *node;
+
+       for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
+               struct stable_node *stable_node;
+
+               stable_node = rb_entry(node, struct stable_node, node);
+               if (stable_node->kpfn >= start_pfn &&
+                   stable_node->kpfn < end_pfn)
+                       return stable_node;
+       }
+       return NULL;
+}
+
+static int ksm_memory_callback(struct notifier_block *self,
+                              unsigned long action, void *arg)
+{
+       struct memory_notify *mn = arg;
+       struct stable_node *stable_node;
+
+       switch (action) {
+       case MEM_GOING_OFFLINE:
+               /*
+                * Keep it very simple for now: just lock out ksmd and
+                * MADV_UNMERGEABLE while any memory is going offline.
+                */
+               mutex_lock(&ksm_thread_mutex);
+               break;
+
+       case MEM_OFFLINE:
+               /*
+                * Most of the work is done by page migration; but there might
+                * be a few stable_nodes left over, still pointing to struct
+                * pages which have been offlined: prune those from the tree.
+                */
+               while ((stable_node = ksm_check_stable_tree(mn->start_pfn,
+                               mn->start_pfn + mn->nr_pages)) != NULL)
+                       remove_node_from_stable_tree(stable_node);
+               /* fallthrough */
+
+       case MEM_CANCEL_OFFLINE:
+               mutex_unlock(&ksm_thread_mutex);
+               break;
+       }
+       return NOTIFY_OK;
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
 #ifdef CONFIG_SYSFS
 /*
  * This all compiles without CONFIG_SYSFS, but is a waste of space.
@@ -1946,6 +1997,13 @@ static int __init ksm_init(void)
 
 #endif /* CONFIG_SYSFS */
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
+       /*
+        * Choose a high priority since the callback takes ksm_thread_mutex:
+        * later callbacks could only be taking locks which nest within that.
+        */
+       hotplug_memory_notifier(ksm_memory_callback, 100);
+#endif
        return 0;
 
 out_free2:
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index bc5a08138f1..67e941d7882 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -698,7 +698,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
        if (list_empty(&source))
                goto out;
        /* this function returns # of failed pages */
-       ret = migrate_pages(&source, hotremove_migrate_alloc, 0);
+       ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
 
 out:
        return ret;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f11fdad0620..290fb5bf044 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -85,6 +85,7 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 #include <linux/migrate.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
@@ -413,17 +414,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                if (!page)
                        continue;
                /*
-                * The check for PageReserved here is important to avoid
-                * handling zero pages and other pages that may have been
-                * marked special by the system.
-                *
-                * If the PageReserved would not be checked here then f.e.
-                * the location of the zero page could have an influence
-                * on MPOL_MF_STRICT, zero pages would be counted for
-                * the per node stats, and there would be useless attempts
-                * to put zero pages on the migration list.
+                * vm_normal_page() filters out zero pages, but there might
+                * still be PageReserved pages to skip, perhaps in a VDSO.
+                * And we cannot move PageKsm pages sensibly or safely yet.
                 */
-               if (PageReserved(page))
+               if (PageReserved(page) || PageKsm(page))
                        continue;
                nid = page_to_nid(page);
                if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -839,7 +834,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
        if (!list_empty(&pagelist))
-               err = migrate_pages(&pagelist, new_node_page, dest);
+               err = migrate_pages(&pagelist, new_node_page, dest, 0);
 
        return err;
 }
@@ -1056,7 +1051,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 
        if (!list_empty(&pagelist))
                nr_failed = migrate_pages(&pagelist, new_vma_page,
-                                       (unsigned long)vma);
+                                       (unsigned long)vma, 0);
 
        if (!err && nr_failed && (flags & MPOL_MF_STRICT))
                err = -EIO;
diff --git a/mm/migrate.c b/mm/migrate.c
index 0b714747c02..2a0ea3ef509 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -543,7 +543,7 @@ static int move_to_new_page(struct page *newpage, struct page *page)
  * to the newly allocated page in newpage.
  */
 static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-                       struct page *page, int force)
+                       struct page *page, int force, int offlining)
 {
        int rc = 0;
        int *result = NULL;
@@ -569,6 +569,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
                lock_page(page);
        }
 
+       /*
+        * Only memory hotplug's offline_pages() caller has locked out KSM,
+        * and can safely migrate a KSM page.  The other cases have skipped
+        * PageKsm along with PageReserved - but it is only now when we have
+        * the page lock that we can be certain it will not go KSM beneath us
+        * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
+        * its pagecount raised, but only here do we take the page lock which
+        * serializes that).
+        */
+       if (PageKsm(page) && !offlining) {
+               rc = -EBUSY;
+               goto unlock;
+       }
+
        /* charge against new page */
        charge = mem_cgroup_prepare_migration(page, &mem);
        if (charge == -ENOMEM) {
@@ -685,7 +699,7 @@ move_newpage:
  * Return: Number of pages not migrated or error code.
  */
 int migrate_pages(struct list_head *from,
-               new_page_t get_new_page, unsigned long private)
+               new_page_t get_new_page, unsigned long private, int offlining)
 {
        int retry = 1;
        int nr_failed = 0;
@@ -705,7 +719,7 @@ int migrate_pages(struct list_head *from,
                        cond_resched();
 
                        rc = unmap_and_move(get_new_page, private,
-                                               page, pass > 2);
+                                               page, pass > 2, offlining);
 
                        switch(rc) {
                        case -ENOMEM:
@@ -801,7 +815,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
                if (!page)
                        goto set_status;
 
-               if (PageReserved(page))         /* Check for zero page */
+               /* Use PageReserved to check for zero page */
+               if (PageReserved(page) || PageKsm(page))
                        goto put_and_set;
 
                pp->page = page;
@@ -838,7 +853,7 @@ set_status:
        err = 0;
        if (!list_empty(&pagelist))
                err = migrate_pages(&pagelist, new_page_node,
-                               (unsigned long)pm);
+                               (unsigned long)pm, 0);
 
        up_read(&mm->mmap_sem);
        return err;
@@ -959,7 +974,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
 
                err = -ENOENT;
                /* Use PageReserved to check for zero page */
-               if (!page || PageReserved(page))
+               if (!page || PageReserved(page) || PageKsm(page))
                        goto set_status;
 
                err = page_to_nid(page);