author    Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>    2010-09-07 21:19:35 -0400
committer Andi Kleen <ak@linux.intel.com>                2010-10-08 03:32:45 -0400
commit    290408d4a25002f099efeee7b6a5778d431154d6 (patch)
tree      c0a69a11ce963ef8a22607f5e782ae667ca5d538
parent    0ebabb416f585ace711769057422af4bbc9d1110 (diff)
hugetlb: hugepage migration core
This patch extends the page migration code to support hugepage migration.
One of the potential users of this feature is soft offlining, which is
triggered by corrected memory errors (added by the next patch).

Todo:
- There are other users of page migration such as memory policy, memory
  hotplug and memory compaction. They are not ready for hugepage support
  for now.

ChangeLog since v4:
- define migrate_huge_pages()
- remove changes on isolation/putback_lru_page()

ChangeLog since v2:
- refactor isolate/putback_lru_page() to handle hugepage
- add comment about race on unmap_and_move_huge_page()

ChangeLog since v1:
- divide migration code path for hugepage
- define routine checking migration swap entry for hugetlb
- replace "goto" with "if/else" in remove_migration_pte()

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
 fs/hugetlbfs/inode.c    |  15
 include/linux/migrate.h |  16
 mm/hugetlb.c            |  18
 mm/migrate.c            | 232
 4 files changed, 262 insertions(+), 19 deletions(-)
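For orientation, here is a minimal, hypothetical sketch of how a caller (for
example the soft-offline path mentioned above, which the next patch adds) might
drive the new migrate_huge_pages() interface. The allocation callback and the
function names new_hpage()/soft_offline_huge_page_sketch() are illustrative
assumptions for this note, not part of the patch below:

    #include <linux/hugetlb.h>
    #include <linux/kernel.h>
    #include <linux/list.h>
    #include <linux/migrate.h>
    #include <linux/mm.h>

    /*
     * Illustrative new_page_t callback (an assumption, not from this patch):
     * allocate the destination hugepage on the same node as the source,
     * using the per-node hugepage allocator added earlier in this series.
     */
    static struct page *new_hpage(struct page *page, unsigned long private,
                                  int **result)
    {
            return alloc_huge_page_node(page_hstate(compound_head(page)),
                                        page_to_nid(page));
    }

    /* Hypothetical caller: move one hugepage that reported corrected errors. */
    static int soft_offline_huge_page_sketch(struct page *hpage)
    {
            LIST_HEAD(pagelist);
            int ret;

            get_page(hpage);                /* hold a reference while it is listed */
            list_add(&hpage->lru, &pagelist);

            /* not a memory-hotplug offlining context, so offlining == 0 */
            ret = migrate_huge_pages(&pagelist, new_hpage, 0, 0);
            if (ret)
                    pr_debug("hugepage migration failed: %d\n", ret);
            return ret;
    }

Note that migrate_huge_pages() drops the reference of any page still on the
list when it returns, so the caller only pins the page for the duration of the
call in this sketch.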
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6e5bd42f3860..1f7ca505d48e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/magic.h>
+#include <linux/migrate.h>
 
 #include <asm/uaccess.h>
 
@@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
 	return 0;
 }
 
+static int hugetlbfs_migrate_page(struct address_space *mapping,
+				struct page *newpage, struct page *page)
+{
+	int rc;
+
+	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+	if (rc)
+		return rc;
+	migrate_page_copy(newpage, page);
+
+	return 0;
+}
+
 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -659,6 +673,7 @@ static const struct address_space_operations hugetlbfs_aops = {
 	.write_begin	= hugetlbfs_write_begin,
 	.write_end	= hugetlbfs_write_end,
 	.set_page_dirty	= hugetlbfs_set_page_dirty,
+	.migratepage    = hugetlbfs_migrate_page,
 };
 
 
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 7238231b8dd4..3c1941e40e61 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -14,6 +14,8 @@ extern int migrate_page(struct address_space *,
 			struct page *, struct page *);
 extern int migrate_pages(struct list_head *l, new_page_t x,
 			unsigned long private, int offlining);
+extern int migrate_huge_pages(struct list_head *l, new_page_t x,
+			unsigned long private, int offlining);
 
 extern int fail_migrate_page(struct address_space *,
 			struct page *, struct page *);
@@ -23,12 +25,17 @@ extern int migrate_prep_local(void);
 extern int migrate_vmas(struct mm_struct *mm,
 		const nodemask_t *from, const nodemask_t *to,
 		unsigned long flags);
+extern void migrate_page_copy(struct page *newpage, struct page *page);
+extern int migrate_huge_page_move_mapping(struct address_space *mapping,
+		struct page *newpage, struct page *page);
 #else
 #define PAGE_MIGRATION 0
 
 static inline void putback_lru_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t x,
 		unsigned long private, int offlining) { return -ENOSYS; }
+static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
+		unsigned long private, int offlining) { return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
 static inline int migrate_prep_local(void) { return -ENOSYS; }
@@ -40,6 +47,15 @@ static inline int migrate_vmas(struct mm_struct *mm,
 	return -ENOSYS;
 }
 
+static inline void migrate_page_copy(struct page *newpage,
+				     struct page *page) {}
+
+extern int migrate_huge_page_move_mapping(struct address_space *mapping,
+				  struct page *newpage, struct page *page)
+{
+	return -ENOSYS;
+}
+
 /* Possible settings for the migrate_page() method in address_operations */
 #define migrate_page NULL
 #define fail_migrate_page NULL
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a73dbdcb89eb..0fa9de8361bd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2217,6 +2217,19 @@ nomem:
 	return -ENOMEM;
 }
 
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+	swp_entry_t swp;
+
+	if (huge_pte_none(pte) || pte_present(pte))
+		return 0;
+	swp = pte_to_swp_entry(pte);
+	if (non_swap_entry(swp) && is_migration_entry(swp)) {
+		return 1;
+	} else
+		return 0;
+}
+
 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 {
 	swp_entry_t swp;
@@ -2648,7 +2661,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	ptep = huge_pte_offset(mm, address);
 	if (ptep) {
 		entry = huge_ptep_get(ptep);
-		if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+		if (unlikely(is_hugetlb_entry_migration(entry))) {
+			migration_entry_wait(mm, (pmd_t *)ptep, address);
+			return 0;
+		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON;
 	}
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f4..55dbc45880c6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,6 +32,7 @@
 #include <linux/security.h>
 #include <linux/memcontrol.h>
 #include <linux/syscalls.h>
+#include <linux/hugetlb.h>
 #include <linux/gfp.h>
 
 #include "internal.h"
@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 	pte_t *ptep, pte;
 	spinlock_t *ptl;
 
-	pgd = pgd_offset(mm, addr);
-	if (!pgd_present(*pgd))
-		goto out;
+	if (unlikely(PageHuge(new))) {
+		ptep = huge_pte_offset(mm, addr);
+		if (!ptep)
+			goto out;
+		ptl = &mm->page_table_lock;
+	} else {
+		pgd = pgd_offset(mm, addr);
+		if (!pgd_present(*pgd))
+			goto out;
 
-	pud = pud_offset(pgd, addr);
-	if (!pud_present(*pud))
-		goto out;
+		pud = pud_offset(pgd, addr);
+		if (!pud_present(*pud))
+			goto out;
 
-	pmd = pmd_offset(pud, addr);
-	if (!pmd_present(*pmd))
-		goto out;
+		pmd = pmd_offset(pud, addr);
+		if (!pmd_present(*pmd))
+			goto out;
 
-	ptep = pte_offset_map(pmd, addr);
+		ptep = pte_offset_map(pmd, addr);
 
-	if (!is_swap_pte(*ptep)) {
-		pte_unmap(ptep);
-		goto out;
-	}
+		if (!is_swap_pte(*ptep)) {
+			pte_unmap(ptep);
+			goto out;
+		}
+
+		ptl = pte_lockptr(mm, pmd);
+	}
 
-	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	pte = *ptep;
 	if (!is_swap_pte(pte))
@@ -130,10 +139,17 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 	if (is_write_migration_entry(entry))
 		pte = pte_mkwrite(pte);
+	if (PageHuge(new))
+		pte = pte_mkhuge(pte);
 	flush_cache_page(vma, addr, pte_pfn(pte));
 	set_pte_at(mm, addr, ptep, pte);
 
-	if (PageAnon(new))
+	if (PageHuge(new)) {
+		if (PageAnon(new))
+			hugepage_add_anon_rmap(new, vma, addr);
+		else
+			page_dup_rmap(new);
+	} else if (PageAnon(new))
 		page_add_anon_rmap(new, vma, addr);
 	else
 		page_add_file_rmap(new);
@@ -276,11 +292,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 }
 
 /*
+ * The expected number of remaining references is the same as that
+ * of migrate_page_move_mapping().
+ */
+int migrate_huge_page_move_mapping(struct address_space *mapping,
+				struct page *newpage, struct page *page)
+{
+	int expected_count;
+	void **pslot;
+
+	if (!mapping) {
+		if (page_count(page) != 1)
+			return -EAGAIN;
+		return 0;
+	}
+
+	spin_lock_irq(&mapping->tree_lock);
+
+	pslot = radix_tree_lookup_slot(&mapping->page_tree,
+					page_index(page));
+
+	expected_count = 2 + page_has_private(page);
+	if (page_count(page) != expected_count ||
+	    (struct page *)radix_tree_deref_slot(pslot) != page) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	if (!page_freeze_refs(page, expected_count)) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	get_page(newpage);
+
+	radix_tree_replace_slot(pslot, newpage);
+
+	page_unfreeze_refs(page, expected_count);
+
+	__put_page(page);
+
+	spin_unlock_irq(&mapping->tree_lock);
+	return 0;
+}
+
+/*
  * Copy the page to its new location
  */
-static void migrate_page_copy(struct page *newpage, struct page *page)
+void migrate_page_copy(struct page *newpage, struct page *page)
 {
-	copy_highpage(newpage, page);
+	if (PageHuge(page))
+		copy_huge_page(newpage, page);
+	else
+		copy_highpage(newpage, page);
 
 	if (PageError(page))
 		SetPageError(newpage);
@@ -724,6 +788,92 @@ move_newpage:
 }
 
 /*
+ * Counterpart of unmap_and_move_page() for hugepage migration.
+ *
+ * This function doesn't wait the completion of hugepage I/O
+ * because there is no race between I/O and migration for hugepage.
+ * Note that currently hugepage I/O occurs only in direct I/O
+ * where no lock is held and PG_writeback is irrelevant,
+ * and writeback status of all subpages are counted in the reference
+ * count of the head page (i.e. if all subpages of a 2MB hugepage are
+ * under direct I/O, the reference of the head page is 512 and a bit more.)
+ * This means that when we try to migrate hugepage whose subpages are
+ * doing direct I/O, some references remain after try_to_unmap() and
+ * hugepage migration fails without data corruption.
+ *
+ * There is also no race when direct I/O is issued on the page under migration,
+ * because then pte is replaced with migration swap entry and direct I/O code
+ * will wait in the page fault for migration to complete.
+ */
+static int unmap_and_move_huge_page(new_page_t get_new_page,
+				unsigned long private, struct page *hpage,
+				int force, int offlining)
+{
+	int rc = 0;
+	int *result = NULL;
+	struct page *new_hpage = get_new_page(hpage, private, &result);
+	int rcu_locked = 0;
+	struct anon_vma *anon_vma = NULL;
+
+	if (!new_hpage)
+		return -ENOMEM;
+
+	rc = -EAGAIN;
+
+	if (!trylock_page(hpage)) {
+		if (!force)
+			goto out;
+		lock_page(hpage);
+	}
+
+	if (PageAnon(hpage)) {
+		rcu_read_lock();
+		rcu_locked = 1;
+
+		if (page_mapped(hpage)) {
+			anon_vma = page_anon_vma(hpage);
+			atomic_inc(&anon_vma->external_refcount);
+		}
+	}
+
+	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+
+	if (!page_mapped(hpage))
+		rc = move_to_new_page(new_hpage, hpage, 1);
+
+	if (rc)
+		remove_migration_ptes(hpage, hpage);
+
+	if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
+			&anon_vma->lock)) {
+		int empty = list_empty(&anon_vma->head);
+		spin_unlock(&anon_vma->lock);
+		if (empty)
+			anon_vma_free(anon_vma);
+	}
+
+	if (rcu_locked)
+		rcu_read_unlock();
+out:
+	unlock_page(hpage);
+
+	if (rc != -EAGAIN) {
+		list_del(&hpage->lru);
+		put_page(hpage);
+	}
+
+	put_page(new_hpage);
+
+	if (result) {
+		if (rc)
+			*result = rc;
+		else
+			*result = page_to_nid(new_hpage);
+	}
+	return rc;
+}
+
+/*
  * migrate_pages
  *
  * The function takes one list of pages to migrate and a function
@@ -788,6 +938,52 @@ out:
 	return nr_failed + retry;
 }
 
+int migrate_huge_pages(struct list_head *from,
+		new_page_t get_new_page, unsigned long private, int offlining)
+{
+	int retry = 1;
+	int nr_failed = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int rc;
+
+	for (pass = 0; pass < 10 && retry; pass++) {
+		retry = 0;
+
+		list_for_each_entry_safe(page, page2, from, lru) {
+			cond_resched();
+
+			rc = unmap_and_move_huge_page(get_new_page,
+					private, page, pass > 2, offlining);
+
+			switch(rc) {
+			case -ENOMEM:
+				goto out;
+			case -EAGAIN:
+				retry++;
+				break;
+			case 0:
+				break;
+			default:
+				/* Permanent failure */
+				nr_failed++;
+				break;
+			}
+		}
+	}
+	rc = 0;
+out:
+
+	list_for_each_entry_safe(page, page2, from, lru)
+		put_page(page);
+
+	if (rc)
+		return rc;
+
+	return nr_failed + retry;
+}
+
 #ifdef CONFIG_NUMA
 /*
  * Move a list of individual pages