author		Linus Torvalds <torvalds@linux-foundation.org>	2010-10-26 13:13:10 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-26 13:13:10 -0400
commit		f1ebdd60cc73ed36fd977f7e719ce70d2f5cd1c0
tree		225cb2ea2c0f8990f29383058a07206cfd835893
parent		f99d055398d53c8f769d5153b3fdce1d2556e7ff
parent		46e387bbd82d438b9131e237e6e2cb55a825da49
Merge branch 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6
* 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6: (22 commits)
Add _addr_lsb field to ia64 siginfo
Fix migration.c compilation on s390
HWPOISON: Remove retry loop for try_to_unmap
HWPOISON: Turn addr_valid from bitfield into char
HWPOISON: Disable DEBUG by default
HWPOISON: Convert pr_debugs to pr_info
HWPOISON: Improve comments in memory-failure.c
x86: HWPOISON: Report correct address granuality for huge hwpoison faults
Encode huge page size for VM_FAULT_HWPOISON errors
Fix build error with !CONFIG_MIGRATION
hugepage: move is_hugepage_on_freelist inside ifdef to avoid warning
Clean up __page_set_anon_rmap
HWPOISON, hugetlb: fix unpoison for hugepage
HWPOISON, hugetlb: soft offlining for hugepage
HWPOSION, hugetlb: recover from free hugepage error when !MF_COUNT_INCREASED
hugetlb: move refcounting in hugepage allocation inside hugetlb_lock
HWPOISON, hugetlb: add free check to dequeue_hwpoison_huge_page()
hugetlb: hugepage migration core
hugetlb: redefine hugepage copy functions
hugetlb: add allocate function for hugepage migration
...
-rw-r--r--  arch/ia64/include/asm/siginfo.h    1
-rw-r--r--  arch/x86/mm/fault.c               19
-rw-r--r--  fs/hugetlbfs/inode.c              15
-rw-r--r--  fs/signalfd.c                     10
-rw-r--r--  include/linux/hugetlb.h           17
-rw-r--r--  include/linux/migrate.h           16
-rw-r--r--  include/linux/mm.h                12
-rw-r--r--  include/linux/signalfd.h           3
-rw-r--r--  mm/hugetlb.c                     233
-rw-r--r--  mm/memory-failure.c              175
-rw-r--r--  mm/memory.c                        3
-rw-r--r--  mm/migrate.c                     234
-rw-r--r--  mm/rmap.c                         25
13 files changed, 596 insertions, 167 deletions
diff --git a/arch/ia64/include/asm/siginfo.h b/arch/ia64/include/asm/siginfo.h
index 118d42979003..c8fcaa2ac48f 100644
--- a/arch/ia64/include/asm/siginfo.h
+++ b/arch/ia64/include/asm/siginfo.h
@@ -62,6 +62,7 @@ typedef struct siginfo {
 			int _imm;		/* immediate value for "break" */
 			unsigned int _flags;	/* see below */
 			unsigned long _isr;	/* isr */
+			short _addr_lsb;	/* lsb of faulting address */
 		} _sigfault;
 
 	/* SIGPOLL */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 79b0b372d2d0..852b319edbdc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -11,6 +11,7 @@
 #include <linux/kprobes.h>		/* __kprobes, ...		*/
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
 #include <linux/perf_event.h>		/* perf_sw_event		*/
+#include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
 
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
@@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 
 static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
-		     struct task_struct *tsk)
+		     struct task_struct *tsk, int fault)
 {
+	unsigned lsb = 0;
 	siginfo_t info;
 
 	info.si_signo	= si_signo;
 	info.si_errno	= 0;
 	info.si_code	= si_code;
 	info.si_addr	= (void __user *)address;
-	info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
+	if (fault & VM_FAULT_HWPOISON_LARGE)
+		lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+	if (fault & VM_FAULT_HWPOISON)
+		lsb = PAGE_SHIFT;
+	info.si_addr_lsb = lsb;
 
 	force_sig_info(si_signo, &info, tsk);
 }
@@ -722,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 	tsk->thread.error_code	= error_code | (address >= TASK_SIZE);
 	tsk->thread.trap_no	= 14;
 
-	force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+	force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
 
 	return;
 }
@@ -807,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	tsk->thread.trap_no	= 14;
 
 #ifdef CONFIG_MEMORY_FAILURE
-	if (fault & VM_FAULT_HWPOISON) {
+	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
 		printk(KERN_ERR
 	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
 			tsk->comm, tsk->pid, address);
 		code = BUS_MCEERR_AR;
 	}
 #endif
-	force_sig_info_fault(SIGBUS, code, address, tsk);
+	force_sig_info_fault(SIGBUS, code, address, tsk, fault);
 }
 
 static noinline void
@@ -824,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	if (fault & VM_FAULT_OOM) {
 		out_of_memory(regs, error_code, address);
 	} else {
-		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
+		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+			     VM_FAULT_HWPOISON_LARGE))
 			do_sigbus(regs, error_code, address, fault);
 		else
 			BUG();
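
With this change the x86 fault path reports not only that a hardware memory error was hit, but how large the affected region is: si_addr_lsb carries PAGE_SHIFT for a poisoned base page and the huge page shift (recovered from the encoded hstate index) for a poisoned huge page. As a rough illustration, a user-space SIGBUS handler could consume the field like the sketch below; this is an editorial example, not part of the patch, and it assumes a libc whose siginfo_t already exposes BUS_MCEERR_AR and si_addr_lsb (availability of both varied across libc versions at the time).

    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Hypothetical handler sketch: interpret a BUS_MCEERR_AR SIGBUS. */
    static void sigbus_handler(int sig, siginfo_t *info, void *ctx)
    {
        (void)sig; (void)ctx;
        if (info->si_code == BUS_MCEERR_AR) {
            /* The poisoned region is 2^si_addr_lsb bytes around si_addr:
             * PAGE_SHIFT for a small page, the huge page shift for a
             * poisoned huge page (new with this merge). */
            unsigned long len = 1UL << info->si_addr_lsb;
            fprintf(stderr, "hw memory error at %p, region %lu bytes\n",
                    info->si_addr, len);
        }
        _exit(1);
    }

    int main(void)
    {
        struct sigaction sa = { .sa_sigaction = sigbus_handler,
                                .sa_flags = SA_SIGINFO };
        sigaction(SIGBUS, &sa, NULL);
        /* ... touch memory that may be poisoned ... */
        return 0;
    }
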
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 113eba3d3c38..a14328d270e8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/statfs.h> | 31 | #include <linux/statfs.h> |
32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
33 | #include <linux/magic.h> | 33 | #include <linux/magic.h> |
34 | #include <linux/migrate.h> | ||
34 | 35 | ||
35 | #include <asm/uaccess.h> | 36 | #include <asm/uaccess.h> |
36 | 37 | ||
@@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page) | |||
573 | return 0; | 574 | return 0; |
574 | } | 575 | } |
575 | 576 | ||
577 | static int hugetlbfs_migrate_page(struct address_space *mapping, | ||
578 | struct page *newpage, struct page *page) | ||
579 | { | ||
580 | int rc; | ||
581 | |||
582 | rc = migrate_huge_page_move_mapping(mapping, newpage, page); | ||
583 | if (rc) | ||
584 | return rc; | ||
585 | migrate_page_copy(newpage, page); | ||
586 | |||
587 | return 0; | ||
588 | } | ||
589 | |||
576 | static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) | 590 | static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) |
577 | { | 591 | { |
578 | struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); | 592 | struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); |
@@ -659,6 +673,7 @@ static const struct address_space_operations hugetlbfs_aops = { | |||
659 | .write_begin = hugetlbfs_write_begin, | 673 | .write_begin = hugetlbfs_write_begin, |
660 | .write_end = hugetlbfs_write_end, | 674 | .write_end = hugetlbfs_write_end, |
661 | .set_page_dirty = hugetlbfs_set_page_dirty, | 675 | .set_page_dirty = hugetlbfs_set_page_dirty, |
676 | .migratepage = hugetlbfs_migrate_page, | ||
662 | }; | 677 | }; |
663 | 678 | ||
664 | 679 | ||
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 74047304b01a..492465b451dd 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -99,6 +99,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
 #ifdef __ARCH_SI_TRAPNO
 		err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
 #endif
+#ifdef BUS_MCEERR_AO
+		/*
+		 * Other callers might not initialize the si_lsb field,
+		 * so check explicitly for the right codes here.
+		 */
+		if (kinfo->si_code == BUS_MCEERR_AR ||
+		    kinfo->si_code == BUS_MCEERR_AO)
+			err |= __put_user((short) kinfo->si_addr_lsb,
+					  &uinfo->ssi_addr_lsb);
+#endif
 		break;
 	case __SI_CHLD:
 		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f479700df61b..943c76b3d4bb 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -43,7 +43,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to, | |||
43 | struct vm_area_struct *vma, | 43 | struct vm_area_struct *vma, |
44 | int acctflags); | 44 | int acctflags); |
45 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); | 45 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); |
46 | void __isolate_hwpoisoned_huge_page(struct page *page); | 46 | int dequeue_hwpoisoned_huge_page(struct page *page); |
47 | void copy_huge_page(struct page *dst, struct page *src); | ||
47 | 48 | ||
48 | extern unsigned long hugepages_treat_as_movable; | 49 | extern unsigned long hugepages_treat_as_movable; |
49 | extern const unsigned long hugetlb_zero, hugetlb_infinity; | 50 | extern const unsigned long hugetlb_zero, hugetlb_infinity; |
@@ -101,7 +102,10 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) | |||
101 | #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) | 102 | #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) |
102 | #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) | 103 | #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) |
103 | #define huge_pte_offset(mm, address) 0 | 104 | #define huge_pte_offset(mm, address) 0 |
104 | #define __isolate_hwpoisoned_huge_page(page) 0 | 105 | #define dequeue_hwpoisoned_huge_page(page) 0 |
106 | static inline void copy_huge_page(struct page *dst, struct page *src) | ||
107 | { | ||
108 | } | ||
105 | 109 | ||
106 | #define hugetlb_change_protection(vma, address, end, newprot) | 110 | #define hugetlb_change_protection(vma, address, end, newprot) |
107 | 111 | ||
@@ -228,6 +232,8 @@ struct huge_bootmem_page { | |||
228 | struct hstate *hstate; | 232 | struct hstate *hstate; |
229 | }; | 233 | }; |
230 | 234 | ||
235 | struct page *alloc_huge_page_node(struct hstate *h, int nid); | ||
236 | |||
231 | /* arch callback */ | 237 | /* arch callback */ |
232 | int __init alloc_bootmem_huge_page(struct hstate *h); | 238 | int __init alloc_bootmem_huge_page(struct hstate *h); |
233 | 239 | ||
@@ -301,8 +307,14 @@ static inline struct hstate *page_hstate(struct page *page) | |||
301 | return size_to_hstate(PAGE_SIZE << compound_order(page)); | 307 | return size_to_hstate(PAGE_SIZE << compound_order(page)); |
302 | } | 308 | } |
303 | 309 | ||
310 | static inline unsigned hstate_index_to_shift(unsigned index) | ||
311 | { | ||
312 | return hstates[index].order + PAGE_SHIFT; | ||
313 | } | ||
314 | |||
304 | #else | 315 | #else |
305 | struct hstate {}; | 316 | struct hstate {}; |
317 | #define alloc_huge_page_node(h, nid) NULL | ||
306 | #define alloc_bootmem_huge_page(h) NULL | 318 | #define alloc_bootmem_huge_page(h) NULL |
307 | #define hstate_file(f) NULL | 319 | #define hstate_file(f) NULL |
308 | #define hstate_vma(v) NULL | 320 | #define hstate_vma(v) NULL |
@@ -317,6 +329,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h) | |||
317 | { | 329 | { |
318 | return 1; | 330 | return 1; |
319 | } | 331 | } |
332 | #define hstate_index_to_shift(index) 0 | ||
320 | #endif | 333 | #endif |
321 | 334 | ||
322 | #endif /* _LINUX_HUGETLB_H */ | 335 | #endif /* _LINUX_HUGETLB_H */ |
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 7238231b8dd4..085527fb8261 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -14,6 +14,8 @@ extern int migrate_page(struct address_space *, | |||
14 | struct page *, struct page *); | 14 | struct page *, struct page *); |
15 | extern int migrate_pages(struct list_head *l, new_page_t x, | 15 | extern int migrate_pages(struct list_head *l, new_page_t x, |
16 | unsigned long private, int offlining); | 16 | unsigned long private, int offlining); |
17 | extern int migrate_huge_pages(struct list_head *l, new_page_t x, | ||
18 | unsigned long private, int offlining); | ||
17 | 19 | ||
18 | extern int fail_migrate_page(struct address_space *, | 20 | extern int fail_migrate_page(struct address_space *, |
19 | struct page *, struct page *); | 21 | struct page *, struct page *); |
@@ -23,12 +25,17 @@ extern int migrate_prep_local(void); | |||
23 | extern int migrate_vmas(struct mm_struct *mm, | 25 | extern int migrate_vmas(struct mm_struct *mm, |
24 | const nodemask_t *from, const nodemask_t *to, | 26 | const nodemask_t *from, const nodemask_t *to, |
25 | unsigned long flags); | 27 | unsigned long flags); |
28 | extern void migrate_page_copy(struct page *newpage, struct page *page); | ||
29 | extern int migrate_huge_page_move_mapping(struct address_space *mapping, | ||
30 | struct page *newpage, struct page *page); | ||
26 | #else | 31 | #else |
27 | #define PAGE_MIGRATION 0 | 32 | #define PAGE_MIGRATION 0 |
28 | 33 | ||
29 | static inline void putback_lru_pages(struct list_head *l) {} | 34 | static inline void putback_lru_pages(struct list_head *l) {} |
30 | static inline int migrate_pages(struct list_head *l, new_page_t x, | 35 | static inline int migrate_pages(struct list_head *l, new_page_t x, |
31 | unsigned long private, int offlining) { return -ENOSYS; } | 36 | unsigned long private, int offlining) { return -ENOSYS; } |
37 | static inline int migrate_huge_pages(struct list_head *l, new_page_t x, | ||
38 | unsigned long private, int offlining) { return -ENOSYS; } | ||
32 | 39 | ||
33 | static inline int migrate_prep(void) { return -ENOSYS; } | 40 | static inline int migrate_prep(void) { return -ENOSYS; } |
34 | static inline int migrate_prep_local(void) { return -ENOSYS; } | 41 | static inline int migrate_prep_local(void) { return -ENOSYS; } |
@@ -40,6 +47,15 @@ static inline int migrate_vmas(struct mm_struct *mm, | |||
40 | return -ENOSYS; | 47 | return -ENOSYS; |
41 | } | 48 | } |
42 | 49 | ||
50 | static inline void migrate_page_copy(struct page *newpage, | ||
51 | struct page *page) {} | ||
52 | |||
53 | static inline int migrate_huge_page_move_mapping(struct address_space *mapping, | ||
54 | struct page *newpage, struct page *page) | ||
55 | { | ||
56 | return -ENOSYS; | ||
57 | } | ||
58 | |||
43 | /* Possible settings for the migrate_page() method in address_operations */ | 59 | /* Possible settings for the migrate_page() method in address_operations */ |
44 | #define migrate_page NULL | 60 | #define migrate_page NULL |
45 | #define fail_migrate_page NULL | 61 | #define fail_migrate_page NULL |
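
The new migrate_huge_pages() follows the same contract as migrate_pages(): the caller passes a list of isolated huge pages plus a new_page_t callback that allocates a replacement page for each source page. The kernel-style sketch below is an editorial illustration modelled on the soft_offline_huge_page() caller added later in this series; the function names are hypothetical, and the caller is assumed to already hold a reference on the huge page.

    #include <linux/mm.h>
    #include <linux/migrate.h>
    #include <linux/hugetlb.h>
    #include <linux/mempolicy.h>

    /* new_page_t callback: allocate a replacement huge page on the
     * same node as the source page. */
    static struct page *new_huge_target(struct page *p, unsigned long private,
                                        int **result)
    {
        return alloc_huge_page_node(page_hstate(compound_head(p)),
                                    page_to_nid(p));
    }

    /* Illustrative wrapper: migrate one already-isolated huge page. */
    static int migrate_one_huge_page(struct page *hpage)
    {
        LIST_HEAD(pagelist);
        int ret;

        /* The reference held on hpage serves as its isolation. */
        list_add(&hpage->lru, &pagelist);
        ret = migrate_huge_pages(&pagelist, new_huge_target,
                                 MPOL_MF_MOVE_ALL, 0);
        if (ret)
            pr_info("hugepage migration failed: %d\n", ret);
        return ret;
    }
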
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7687228dd3b7..a4c66846fb8f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -718,12 +718,20 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_SIGBUS	0x0002
 #define VM_FAULT_MAJOR	0x0004
 #define VM_FAULT_WRITE	0x0008	/* Special case for get_user_pages */
-#define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned page */
+#define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned small page */
+#define VM_FAULT_HWPOISON_LARGE 0x0020  /* Hit poisoned large page. Index encoded in upper bits */
 
 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
 
-#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
+#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
+
+#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
+			 VM_FAULT_HWPOISON_LARGE)
+
+/* Encode hstate index for a hwpoisoned large page */
+#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
+#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
 
 /*
  * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
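
The hstate index is packed into bits 12-15 of the fault return code, so one int carries both the VM_FAULT_* flags and the huge page size that was hit: hugetlb_fault() encodes it with VM_FAULT_SET_HINDEX(h - hstates), and the arch fault handler decodes it with VM_FAULT_GET_HINDEX() before calling hstate_index_to_shift(). The small stand-alone program below is an editorial sketch of that round trip, with the two macros copied from the hunk above into user space purely for demonstration; the index value is illustrative.

    #include <assert.h>
    #include <stdio.h>

    /* Copied from the hunk above: hstate index packed into bits 12-15. */
    #define VM_FAULT_HWPOISON_LARGE 0x0020
    #define VM_FAULT_SET_HINDEX(x) ((x) << 12)
    #define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)

    int main(void)
    {
        /* Illustrative hstate index; in-kernel this is (h - hstates). */
        unsigned int index = 1;
        unsigned int ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(index);

        assert(ret & VM_FAULT_HWPOISON_LARGE);
        assert(VM_FAULT_GET_HINDEX(ret) == index);
        printf("fault value 0x%x encodes hstate index %u\n",
               ret, VM_FAULT_GET_HINDEX(ret));
        return 0;
    }
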
diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h
index b363b916c909..3ff4961da9b5 100644
--- a/include/linux/signalfd.h
+++ b/include/linux/signalfd.h
@@ -33,6 +33,7 @@ struct signalfd_siginfo {
 	__u64 ssi_utime;
 	__u64 ssi_stime;
 	__u64 ssi_addr;
+	__u16 ssi_addr_lsb;
 
 	/*
 	 * Pad strcture to 128 bytes. Remember to update the
@@ -43,7 +44,7 @@ struct signalfd_siginfo {
 	 * comes out of a read(2) and we really don't want to have
 	 * a compat on read(2).
 	 */
-	__u8 __pad[48];
+	__u8 __pad[46];
 };
 
 
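
Because signalfd relays the same information through struct signalfd_siginfo (the pad shrinks so the structure stays at 128 bytes), a process that receives SIGBUS via a signal file descriptor can now see the address-granularity hint as well. The user-space sketch below is an editorial example, not part of the patch; it trims error handling and assumes kernel/libc headers that already carry the new ssi_addr_lsb field.

    #include <sys/signalfd.h>
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        sigset_t mask;
        struct signalfd_siginfo fdsi;
        int sfd;

        sigemptyset(&mask);
        sigaddset(&mask, SIGBUS);
        sigprocmask(SIG_BLOCK, &mask, NULL);  /* deliver via the fd instead */

        sfd = signalfd(-1, &mask, 0);
        if (sfd < 0)
            return 1;

        if (read(sfd, &fdsi, sizeof(fdsi)) == (ssize_t)sizeof(fdsi) &&
            fdsi.ssi_signo == SIGBUS) {
            /* For BUS_MCEERR_AR/AO, ssi_addr_lsb gives the log2 size of the
             * affected region (huge page aware with this merge). */
            printf("SIGBUS code=%d addr=0x%llx region=%llu bytes\n",
                   fdsi.ssi_code, (unsigned long long)fdsi.ssi_addr,
                   1ULL << fdsi.ssi_addr_lsb);
        }
        return 0;
    }
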
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182..96991ded82fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page, | |||
423 | } | 423 | } |
424 | } | 424 | } |
425 | 425 | ||
426 | static void copy_gigantic_page(struct page *dst, struct page *src, | 426 | static void copy_user_gigantic_page(struct page *dst, struct page *src, |
427 | unsigned long addr, struct vm_area_struct *vma) | 427 | unsigned long addr, struct vm_area_struct *vma) |
428 | { | 428 | { |
429 | int i; | 429 | int i; |
430 | struct hstate *h = hstate_vma(vma); | 430 | struct hstate *h = hstate_vma(vma); |
431 | struct page *dst_base = dst; | 431 | struct page *dst_base = dst; |
432 | struct page *src_base = src; | 432 | struct page *src_base = src; |
433 | might_sleep(); | 433 | |
434 | for (i = 0; i < pages_per_huge_page(h); ) { | 434 | for (i = 0; i < pages_per_huge_page(h); ) { |
435 | cond_resched(); | 435 | cond_resched(); |
436 | copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); | 436 | copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); |
@@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src, | |||
440 | src = mem_map_next(src, src_base, i); | 440 | src = mem_map_next(src, src_base, i); |
441 | } | 441 | } |
442 | } | 442 | } |
443 | static void copy_huge_page(struct page *dst, struct page *src, | 443 | |
444 | static void copy_user_huge_page(struct page *dst, struct page *src, | ||
444 | unsigned long addr, struct vm_area_struct *vma) | 445 | unsigned long addr, struct vm_area_struct *vma) |
445 | { | 446 | { |
446 | int i; | 447 | int i; |
447 | struct hstate *h = hstate_vma(vma); | 448 | struct hstate *h = hstate_vma(vma); |
448 | 449 | ||
449 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { | 450 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { |
450 | copy_gigantic_page(dst, src, addr, vma); | 451 | copy_user_gigantic_page(dst, src, addr, vma); |
451 | return; | 452 | return; |
452 | } | 453 | } |
453 | 454 | ||
@@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src, | |||
458 | } | 459 | } |
459 | } | 460 | } |
460 | 461 | ||
462 | static void copy_gigantic_page(struct page *dst, struct page *src) | ||
463 | { | ||
464 | int i; | ||
465 | struct hstate *h = page_hstate(src); | ||
466 | struct page *dst_base = dst; | ||
467 | struct page *src_base = src; | ||
468 | |||
469 | for (i = 0; i < pages_per_huge_page(h); ) { | ||
470 | cond_resched(); | ||
471 | copy_highpage(dst, src); | ||
472 | |||
473 | i++; | ||
474 | dst = mem_map_next(dst, dst_base, i); | ||
475 | src = mem_map_next(src, src_base, i); | ||
476 | } | ||
477 | } | ||
478 | |||
479 | void copy_huge_page(struct page *dst, struct page *src) | ||
480 | { | ||
481 | int i; | ||
482 | struct hstate *h = page_hstate(src); | ||
483 | |||
484 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { | ||
485 | copy_gigantic_page(dst, src); | ||
486 | return; | ||
487 | } | ||
488 | |||
489 | might_sleep(); | ||
490 | for (i = 0; i < pages_per_huge_page(h); i++) { | ||
491 | cond_resched(); | ||
492 | copy_highpage(dst + i, src + i); | ||
493 | } | ||
494 | } | ||
495 | |||
461 | static void enqueue_huge_page(struct hstate *h, struct page *page) | 496 | static void enqueue_huge_page(struct hstate *h, struct page *page) |
462 | { | 497 | { |
463 | int nid = page_to_nid(page); | 498 | int nid = page_to_nid(page); |
@@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) | |||
466 | h->free_huge_pages_node[nid]++; | 501 | h->free_huge_pages_node[nid]++; |
467 | } | 502 | } |
468 | 503 | ||
504 | static struct page *dequeue_huge_page_node(struct hstate *h, int nid) | ||
505 | { | ||
506 | struct page *page; | ||
507 | |||
508 | if (list_empty(&h->hugepage_freelists[nid])) | ||
509 | return NULL; | ||
510 | page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); | ||
511 | list_del(&page->lru); | ||
512 | set_page_refcounted(page); | ||
513 | h->free_huge_pages--; | ||
514 | h->free_huge_pages_node[nid]--; | ||
515 | return page; | ||
516 | } | ||
517 | |||
469 | static struct page *dequeue_huge_page_vma(struct hstate *h, | 518 | static struct page *dequeue_huge_page_vma(struct hstate *h, |
470 | struct vm_area_struct *vma, | 519 | struct vm_area_struct *vma, |
471 | unsigned long address, int avoid_reserve) | 520 | unsigned long address, int avoid_reserve) |
472 | { | 521 | { |
473 | int nid; | ||
474 | struct page *page = NULL; | 522 | struct page *page = NULL; |
475 | struct mempolicy *mpol; | 523 | struct mempolicy *mpol; |
476 | nodemask_t *nodemask; | 524 | nodemask_t *nodemask; |
@@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
496 | 544 | ||
497 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 545 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
498 | MAX_NR_ZONES - 1, nodemask) { | 546 | MAX_NR_ZONES - 1, nodemask) { |
499 | nid = zone_to_nid(zone); | 547 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { |
500 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && | 548 | page = dequeue_huge_page_node(h, zone_to_nid(zone)); |
501 | !list_empty(&h->hugepage_freelists[nid])) { | 549 | if (page) { |
502 | page = list_entry(h->hugepage_freelists[nid].next, | 550 | if (!avoid_reserve) |
503 | struct page, lru); | 551 | decrement_hugepage_resv_vma(h, vma); |
504 | list_del(&page->lru); | 552 | break; |
505 | h->free_huge_pages--; | 553 | } |
506 | h->free_huge_pages_node[nid]--; | ||
507 | |||
508 | if (!avoid_reserve) | ||
509 | decrement_hugepage_resv_vma(h, vma); | ||
510 | |||
511 | break; | ||
512 | } | 554 | } |
513 | } | 555 | } |
514 | err: | 556 | err: |
@@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, | |||
770 | return ret; | 812 | return ret; |
771 | } | 813 | } |
772 | 814 | ||
773 | static struct page *alloc_buddy_huge_page(struct hstate *h, | 815 | static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) |
774 | struct vm_area_struct *vma, unsigned long address) | ||
775 | { | 816 | { |
776 | struct page *page; | 817 | struct page *page; |
777 | unsigned int nid; | 818 | unsigned int r_nid; |
778 | 819 | ||
779 | if (h->order >= MAX_ORDER) | 820 | if (h->order >= MAX_ORDER) |
780 | return NULL; | 821 | return NULL; |
@@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, | |||
812 | } | 853 | } |
813 | spin_unlock(&hugetlb_lock); | 854 | spin_unlock(&hugetlb_lock); |
814 | 855 | ||
815 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP| | 856 | if (nid == NUMA_NO_NODE) |
816 | __GFP_REPEAT|__GFP_NOWARN, | 857 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP| |
817 | huge_page_order(h)); | 858 | __GFP_REPEAT|__GFP_NOWARN, |
859 | huge_page_order(h)); | ||
860 | else | ||
861 | page = alloc_pages_exact_node(nid, | ||
862 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| | ||
863 | __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); | ||
818 | 864 | ||
819 | if (page && arch_prepare_hugepage(page)) { | 865 | if (page && arch_prepare_hugepage(page)) { |
820 | __free_pages(page, huge_page_order(h)); | 866 | __free_pages(page, huge_page_order(h)); |
@@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, | |||
823 | 869 | ||
824 | spin_lock(&hugetlb_lock); | 870 | spin_lock(&hugetlb_lock); |
825 | if (page) { | 871 | if (page) { |
826 | /* | 872 | r_nid = page_to_nid(page); |
827 | * This page is now managed by the hugetlb allocator and has | ||
828 | * no users -- drop the buddy allocator's reference. | ||
829 | */ | ||
830 | put_page_testzero(page); | ||
831 | VM_BUG_ON(page_count(page)); | ||
832 | nid = page_to_nid(page); | ||
833 | set_compound_page_dtor(page, free_huge_page); | 873 | set_compound_page_dtor(page, free_huge_page); |
834 | /* | 874 | /* |
835 | * We incremented the global counters already | 875 | * We incremented the global counters already |
836 | */ | 876 | */ |
837 | h->nr_huge_pages_node[nid]++; | 877 | h->nr_huge_pages_node[r_nid]++; |
838 | h->surplus_huge_pages_node[nid]++; | 878 | h->surplus_huge_pages_node[r_nid]++; |
839 | __count_vm_event(HTLB_BUDDY_PGALLOC); | 879 | __count_vm_event(HTLB_BUDDY_PGALLOC); |
840 | } else { | 880 | } else { |
841 | h->nr_huge_pages--; | 881 | h->nr_huge_pages--; |
@@ -848,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, | |||
848 | } | 888 | } |
849 | 889 | ||
850 | /* | 890 | /* |
891 | * This allocation function is useful in the context where vma is irrelevant. | ||
892 | * E.g. soft-offlining uses this function because it only cares physical | ||
893 | * address of error page. | ||
894 | */ | ||
895 | struct page *alloc_huge_page_node(struct hstate *h, int nid) | ||
896 | { | ||
897 | struct page *page; | ||
898 | |||
899 | spin_lock(&hugetlb_lock); | ||
900 | page = dequeue_huge_page_node(h, nid); | ||
901 | spin_unlock(&hugetlb_lock); | ||
902 | |||
903 | if (!page) | ||
904 | page = alloc_buddy_huge_page(h, nid); | ||
905 | |||
906 | return page; | ||
907 | } | ||
908 | |||
909 | /* | ||
851 | * Increase the hugetlb pool such that it can accomodate a reservation | 910 | * Increase the hugetlb pool such that it can accomodate a reservation |
852 | * of size 'delta'. | 911 | * of size 'delta'. |
853 | */ | 912 | */ |
@@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta) | |||
871 | retry: | 930 | retry: |
872 | spin_unlock(&hugetlb_lock); | 931 | spin_unlock(&hugetlb_lock); |
873 | for (i = 0; i < needed; i++) { | 932 | for (i = 0; i < needed; i++) { |
874 | page = alloc_buddy_huge_page(h, NULL, 0); | 933 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
875 | if (!page) { | 934 | if (!page) |
876 | /* | 935 | /* |
877 | * We were not able to allocate enough pages to | 936 | * We were not able to allocate enough pages to |
878 | * satisfy the entire reservation so we free what | 937 | * satisfy the entire reservation so we free what |
879 | * we've allocated so far. | 938 | * we've allocated so far. |
880 | */ | 939 | */ |
881 | spin_lock(&hugetlb_lock); | ||
882 | needed = 0; | ||
883 | goto free; | 940 | goto free; |
884 | } | ||
885 | 941 | ||
886 | list_add(&page->lru, &surplus_list); | 942 | list_add(&page->lru, &surplus_list); |
887 | } | 943 | } |
@@ -908,31 +964,31 @@ retry: | |||
908 | needed += allocated; | 964 | needed += allocated; |
909 | h->resv_huge_pages += delta; | 965 | h->resv_huge_pages += delta; |
910 | ret = 0; | 966 | ret = 0; |
911 | free: | 967 | |
968 | spin_unlock(&hugetlb_lock); | ||
912 | /* Free the needed pages to the hugetlb pool */ | 969 | /* Free the needed pages to the hugetlb pool */ |
913 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 970 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
914 | if ((--needed) < 0) | 971 | if ((--needed) < 0) |
915 | break; | 972 | break; |
916 | list_del(&page->lru); | 973 | list_del(&page->lru); |
974 | /* | ||
975 | * This page is now managed by the hugetlb allocator and has | ||
976 | * no users -- drop the buddy allocator's reference. | ||
977 | */ | ||
978 | put_page_testzero(page); | ||
979 | VM_BUG_ON(page_count(page)); | ||
917 | enqueue_huge_page(h, page); | 980 | enqueue_huge_page(h, page); |
918 | } | 981 | } |
919 | 982 | ||
920 | /* Free unnecessary surplus pages to the buddy allocator */ | 983 | /* Free unnecessary surplus pages to the buddy allocator */ |
984 | free: | ||
921 | if (!list_empty(&surplus_list)) { | 985 | if (!list_empty(&surplus_list)) { |
922 | spin_unlock(&hugetlb_lock); | ||
923 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 986 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
924 | list_del(&page->lru); | 987 | list_del(&page->lru); |
925 | /* | 988 | put_page(page); |
926 | * The page has a reference count of zero already, so | ||
927 | * call free_huge_page directly instead of using | ||
928 | * put_page. This must be done with hugetlb_lock | ||
929 | * unlocked which is safe because free_huge_page takes | ||
930 | * hugetlb_lock before deciding how to free the page. | ||
931 | */ | ||
932 | free_huge_page(page); | ||
933 | } | 989 | } |
934 | spin_lock(&hugetlb_lock); | ||
935 | } | 990 | } |
991 | spin_lock(&hugetlb_lock); | ||
936 | 992 | ||
937 | return ret; | 993 | return ret; |
938 | } | 994 | } |
@@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1052 | spin_unlock(&hugetlb_lock); | 1108 | spin_unlock(&hugetlb_lock); |
1053 | 1109 | ||
1054 | if (!page) { | 1110 | if (!page) { |
1055 | page = alloc_buddy_huge_page(h, vma, addr); | 1111 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
1056 | if (!page) { | 1112 | if (!page) { |
1057 | hugetlb_put_quota(inode->i_mapping, chg); | 1113 | hugetlb_put_quota(inode->i_mapping, chg); |
1058 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1114 | return ERR_PTR(-VM_FAULT_SIGBUS); |
1059 | } | 1115 | } |
1060 | } | 1116 | } |
1061 | 1117 | ||
1062 | set_page_refcounted(page); | ||
1063 | set_page_private(page, (unsigned long) mapping); | 1118 | set_page_private(page, (unsigned long) mapping); |
1064 | 1119 | ||
1065 | vma_commit_reservation(h, vma, addr); | 1120 | vma_commit_reservation(h, vma, addr); |
@@ -2153,6 +2208,19 @@ nomem: | |||
2153 | return -ENOMEM; | 2208 | return -ENOMEM; |
2154 | } | 2209 | } |
2155 | 2210 | ||
2211 | static int is_hugetlb_entry_migration(pte_t pte) | ||
2212 | { | ||
2213 | swp_entry_t swp; | ||
2214 | |||
2215 | if (huge_pte_none(pte) || pte_present(pte)) | ||
2216 | return 0; | ||
2217 | swp = pte_to_swp_entry(pte); | ||
2218 | if (non_swap_entry(swp) && is_migration_entry(swp)) { | ||
2219 | return 1; | ||
2220 | } else | ||
2221 | return 0; | ||
2222 | } | ||
2223 | |||
2156 | static int is_hugetlb_entry_hwpoisoned(pte_t pte) | 2224 | static int is_hugetlb_entry_hwpoisoned(pte_t pte) |
2157 | { | 2225 | { |
2158 | swp_entry_t swp; | 2226 | swp_entry_t swp; |
@@ -2383,7 +2451,7 @@ retry_avoidcopy: | |||
2383 | if (unlikely(anon_vma_prepare(vma))) | 2451 | if (unlikely(anon_vma_prepare(vma))) |
2384 | return VM_FAULT_OOM; | 2452 | return VM_FAULT_OOM; |
2385 | 2453 | ||
2386 | copy_huge_page(new_page, old_page, address, vma); | 2454 | copy_user_huge_page(new_page, old_page, address, vma); |
2387 | __SetPageUptodate(new_page); | 2455 | __SetPageUptodate(new_page); |
2388 | 2456 | ||
2389 | /* | 2457 | /* |
@@ -2515,22 +2583,20 @@ retry: | |||
2515 | hugepage_add_new_anon_rmap(page, vma, address); | 2583 | hugepage_add_new_anon_rmap(page, vma, address); |
2516 | } | 2584 | } |
2517 | } else { | 2585 | } else { |
2586 | /* | ||
2587 | * If memory error occurs between mmap() and fault, some process | ||
2588 | * don't have hwpoisoned swap entry for errored virtual address. | ||
2589 | * So we need to block hugepage fault by PG_hwpoison bit check. | ||
2590 | */ | ||
2591 | if (unlikely(PageHWPoison(page))) { | ||
2592 | ret = VM_FAULT_HWPOISON | | ||
2593 | VM_FAULT_SET_HINDEX(h - hstates); | ||
2594 | goto backout_unlocked; | ||
2595 | } | ||
2518 | page_dup_rmap(page); | 2596 | page_dup_rmap(page); |
2519 | } | 2597 | } |
2520 | 2598 | ||
2521 | /* | 2599 | /* |
2522 | * Since memory error handler replaces pte into hwpoison swap entry | ||
2523 | * at the time of error handling, a process which reserved but not have | ||
2524 | * the mapping to the error hugepage does not have hwpoison swap entry. | ||
2525 | * So we need to block accesses from such a process by checking | ||
2526 | * PG_hwpoison bit here. | ||
2527 | */ | ||
2528 | if (unlikely(PageHWPoison(page))) { | ||
2529 | ret = VM_FAULT_HWPOISON; | ||
2530 | goto backout_unlocked; | ||
2531 | } | ||
2532 | |||
2533 | /* | ||
2534 | * If we are going to COW a private mapping later, we examine the | 2600 | * If we are going to COW a private mapping later, we examine the |
2535 | * pending reservations for this page now. This will ensure that | 2601 | * pending reservations for this page now. This will ensure that |
2536 | * any allocations necessary to record that reservation occur outside | 2602 | * any allocations necessary to record that reservation occur outside |
@@ -2587,8 +2653,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2587 | ptep = huge_pte_offset(mm, address); | 2653 | ptep = huge_pte_offset(mm, address); |
2588 | if (ptep) { | 2654 | if (ptep) { |
2589 | entry = huge_ptep_get(ptep); | 2655 | entry = huge_ptep_get(ptep); |
2590 | if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | 2656 | if (unlikely(is_hugetlb_entry_migration(entry))) { |
2591 | return VM_FAULT_HWPOISON; | 2657 | migration_entry_wait(mm, (pmd_t *)ptep, address); |
2658 | return 0; | ||
2659 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | ||
2660 | return VM_FAULT_HWPOISON_LARGE | | ||
2661 | VM_FAULT_SET_HINDEX(h - hstates); | ||
2592 | } | 2662 | } |
2593 | 2663 | ||
2594 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); | 2664 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); |
@@ -2878,18 +2948,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | |||
2878 | hugetlb_acct_memory(h, -(chg - freed)); | 2948 | hugetlb_acct_memory(h, -(chg - freed)); |
2879 | } | 2949 | } |
2880 | 2950 | ||
2951 | #ifdef CONFIG_MEMORY_FAILURE | ||
2952 | |||
2953 | /* Should be called in hugetlb_lock */ | ||
2954 | static int is_hugepage_on_freelist(struct page *hpage) | ||
2955 | { | ||
2956 | struct page *page; | ||
2957 | struct page *tmp; | ||
2958 | struct hstate *h = page_hstate(hpage); | ||
2959 | int nid = page_to_nid(hpage); | ||
2960 | |||
2961 | list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) | ||
2962 | if (page == hpage) | ||
2963 | return 1; | ||
2964 | return 0; | ||
2965 | } | ||
2966 | |||
2881 | /* | 2967 | /* |
2882 | * This function is called from memory failure code. | 2968 | * This function is called from memory failure code. |
2883 | * Assume the caller holds page lock of the head page. | 2969 | * Assume the caller holds page lock of the head page. |
2884 | */ | 2970 | */ |
2885 | void __isolate_hwpoisoned_huge_page(struct page *hpage) | 2971 | int dequeue_hwpoisoned_huge_page(struct page *hpage) |
2886 | { | 2972 | { |
2887 | struct hstate *h = page_hstate(hpage); | 2973 | struct hstate *h = page_hstate(hpage); |
2888 | int nid = page_to_nid(hpage); | 2974 | int nid = page_to_nid(hpage); |
2975 | int ret = -EBUSY; | ||
2889 | 2976 | ||
2890 | spin_lock(&hugetlb_lock); | 2977 | spin_lock(&hugetlb_lock); |
2891 | list_del(&hpage->lru); | 2978 | if (is_hugepage_on_freelist(hpage)) { |
2892 | h->free_huge_pages--; | 2979 | list_del(&hpage->lru); |
2893 | h->free_huge_pages_node[nid]--; | 2980 | set_page_refcounted(hpage); |
2981 | h->free_huge_pages--; | ||
2982 | h->free_huge_pages_node[nid]--; | ||
2983 | ret = 0; | ||
2984 | } | ||
2894 | spin_unlock(&hugetlb_lock); | 2985 | spin_unlock(&hugetlb_lock); |
2986 | return ret; | ||
2895 | } | 2987 | } |
2988 | #endif | ||
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 757f6b0accfe..44a8cefeae6e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@ | |||
7 | * Free Software Foundation. | 7 | * Free Software Foundation. |
8 | * | 8 | * |
9 | * High level machine check handler. Handles pages reported by the | 9 | * High level machine check handler. Handles pages reported by the |
10 | * hardware as being corrupted usually due to a 2bit ECC memory or cache | 10 | * hardware as being corrupted usually due to a multi-bit ECC memory or cache |
11 | * failure. | 11 | * failure. |
12 | * | ||
13 | * In addition there is a "soft offline" entry point that allows stop using | ||
14 | * not-yet-corrupted-by-suspicious pages without killing anything. | ||
12 | * | 15 | * |
13 | * Handles page cache pages in various states. The tricky part | 16 | * Handles page cache pages in various states. The tricky part |
14 | * here is that we can access any page asynchronous to other VM | 17 | * here is that we can access any page asynchronously in respect to |
15 | * users, because memory failures could happen anytime and anywhere, | 18 | * other VM users, because memory failures could happen anytime and |
16 | * possibly violating some of their assumptions. This is why this code | 19 | * anywhere. This could violate some of their assumptions. This is why |
17 | * has to be extremely careful. Generally it tries to use normal locking | 20 | * this code has to be extremely careful. Generally it tries to use |
18 | * rules, as in get the standard locks, even if that means the | 21 | * normal locking rules, as in get the standard locks, even if that means |
19 | * error handling takes potentially a long time. | 22 | * the error handling takes potentially a long time. |
20 | * | 23 | * |
21 | * The operation to map back from RMAP chains to processes has to walk | 24 | * There are several operations here with exponential complexity because |
22 | * the complete process list and has non linear complexity with the number | 25 | * of unsuitable VM data structures. For example the operation to map back |
23 | * mappings. In short it can be quite slow. But since memory corruptions | 26 | * from RMAP chains to processes has to walk the complete process list and |
24 | * are rare we hope to get away with this. | 27 | * has non linear complexity with the number. But since memory corruptions |
28 | * are rare we hope to get away with this. This avoids impacting the core | ||
29 | * VM. | ||
25 | */ | 30 | */ |
26 | 31 | ||
27 | /* | 32 | /* |
@@ -30,7 +35,6 @@ | |||
30 | * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages | 35 | * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages |
31 | * - pass bad pages to kdump next kernel | 36 | * - pass bad pages to kdump next kernel |
32 | */ | 37 | */ |
33 | #define DEBUG 1 /* remove me in 2.6.34 */ | ||
34 | #include <linux/kernel.h> | 38 | #include <linux/kernel.h> |
35 | #include <linux/mm.h> | 39 | #include <linux/mm.h> |
36 | #include <linux/page-flags.h> | 40 | #include <linux/page-flags.h> |
@@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p) | |||
78 | return 0; | 82 | return 0; |
79 | 83 | ||
80 | /* | 84 | /* |
81 | * page_mapping() does not accept slab page | 85 | * page_mapping() does not accept slab pages. |
82 | */ | 86 | */ |
83 | if (PageSlab(p)) | 87 | if (PageSlab(p)) |
84 | return -EINVAL; | 88 | return -EINVAL; |
@@ -268,7 +272,7 @@ struct to_kill { | |||
268 | struct list_head nd; | 272 | struct list_head nd; |
269 | struct task_struct *tsk; | 273 | struct task_struct *tsk; |
270 | unsigned long addr; | 274 | unsigned long addr; |
271 | unsigned addr_valid:1; | 275 | char addr_valid; |
272 | }; | 276 | }; |
273 | 277 | ||
274 | /* | 278 | /* |
@@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, | |||
309 | * a SIGKILL because the error is not contained anymore. | 313 | * a SIGKILL because the error is not contained anymore. |
310 | */ | 314 | */ |
311 | if (tk->addr == -EFAULT) { | 315 | if (tk->addr == -EFAULT) { |
312 | pr_debug("MCE: Unable to find user space address %lx in %s\n", | 316 | pr_info("MCE: Unable to find user space address %lx in %s\n", |
313 | page_to_pfn(p), tsk->comm); | 317 | page_to_pfn(p), tsk->comm); |
314 | tk->addr_valid = 0; | 318 | tk->addr_valid = 0; |
315 | } | 319 | } |
@@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
577 | pfn, err); | 581 | pfn, err); |
578 | } else if (page_has_private(p) && | 582 | } else if (page_has_private(p) && |
579 | !try_to_release_page(p, GFP_NOIO)) { | 583 | !try_to_release_page(p, GFP_NOIO)) { |
580 | pr_debug("MCE %#lx: failed to release buffers\n", pfn); | 584 | pr_info("MCE %#lx: failed to release buffers\n", pfn); |
581 | } else { | 585 | } else { |
582 | ret = RECOVERED; | 586 | ret = RECOVERED; |
583 | } | 587 | } |
@@ -693,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) | |||
693 | * Issues: | 697 | * Issues: |
694 | * - Error on hugepage is contained in hugepage unit (not in raw page unit.) | 698 | * - Error on hugepage is contained in hugepage unit (not in raw page unit.) |
695 | * To narrow down kill region to one page, we need to break up pmd. | 699 | * To narrow down kill region to one page, we need to break up pmd. |
696 | * - To support soft-offlining for hugepage, we need to support hugepage | ||
697 | * migration. | ||
698 | */ | 700 | */ |
699 | static int me_huge_page(struct page *p, unsigned long pfn) | 701 | static int me_huge_page(struct page *p, unsigned long pfn) |
700 | { | 702 | { |
703 | int res = 0; | ||
701 | struct page *hpage = compound_head(p); | 704 | struct page *hpage = compound_head(p); |
702 | /* | 705 | /* |
703 | * We can safely recover from error on free or reserved (i.e. | 706 | * We can safely recover from error on free or reserved (i.e. |
@@ -710,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
710 | * so there is no race between isolation and mapping/unmapping. | 713 | * so there is no race between isolation and mapping/unmapping. |
711 | */ | 714 | */ |
712 | if (!(page_mapping(hpage) || PageAnon(hpage))) { | 715 | if (!(page_mapping(hpage) || PageAnon(hpage))) { |
713 | __isolate_hwpoisoned_huge_page(hpage); | 716 | res = dequeue_hwpoisoned_huge_page(hpage); |
714 | return RECOVERED; | 717 | if (!res) |
718 | return RECOVERED; | ||
715 | } | 719 | } |
716 | return DELAYED; | 720 | return DELAYED; |
717 | } | 721 | } |
@@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p, | |||
836 | return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; | 840 | return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; |
837 | } | 841 | } |
838 | 842 | ||
839 | #define N_UNMAP_TRIES 5 | ||
840 | |||
841 | /* | 843 | /* |
842 | * Do all that is necessary to remove user space mappings. Unmap | 844 | * Do all that is necessary to remove user space mappings. Unmap |
843 | * the pages and send SIGBUS to the processes if the data was dirty. | 845 | * the pages and send SIGBUS to the processes if the data was dirty. |
@@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
849 | struct address_space *mapping; | 851 | struct address_space *mapping; |
850 | LIST_HEAD(tokill); | 852 | LIST_HEAD(tokill); |
851 | int ret; | 853 | int ret; |
852 | int i; | ||
853 | int kill = 1; | 854 | int kill = 1; |
854 | struct page *hpage = compound_head(p); | 855 | struct page *hpage = compound_head(p); |
855 | 856 | ||
@@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
903 | if (kill) | 904 | if (kill) |
904 | collect_procs(hpage, &tokill); | 905 | collect_procs(hpage, &tokill); |
905 | 906 | ||
906 | /* | 907 | ret = try_to_unmap(hpage, ttu); |
907 | * try_to_unmap can fail temporarily due to races. | ||
908 | * Try a few times (RED-PEN better strategy?) | ||
909 | */ | ||
910 | for (i = 0; i < N_UNMAP_TRIES; i++) { | ||
911 | ret = try_to_unmap(hpage, ttu); | ||
912 | if (ret == SWAP_SUCCESS) | ||
913 | break; | ||
914 | pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); | ||
915 | } | ||
916 | |||
917 | if (ret != SWAP_SUCCESS) | 908 | if (ret != SWAP_SUCCESS) |
918 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", | 909 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", |
919 | pfn, page_mapcount(hpage)); | 910 | pfn, page_mapcount(hpage)); |
@@ -981,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
981 | * We need/can do nothing about count=0 pages. | 972 | * We need/can do nothing about count=0 pages. |
982 | * 1) it's a free page, and therefore in safe hand: | 973 | * 1) it's a free page, and therefore in safe hand: |
983 | * prep_new_page() will be the gate keeper. | 974 | * prep_new_page() will be the gate keeper. |
984 | * 2) it's part of a non-compound high order page. | 975 | * 2) it's a free hugepage, which is also safe: |
976 | * an affected hugepage will be dequeued from hugepage freelist, | ||
977 | * so there's no concern about reusing it ever after. | ||
978 | * 3) it's part of a non-compound high order page. | ||
985 | * Implies some kernel user: cannot stop them from | 979 | * Implies some kernel user: cannot stop them from |
986 | * R/W the page; let's pray that the page has been | 980 | * R/W the page; let's pray that the page has been |
987 | * used and will be freed some time later. | 981 | * used and will be freed some time later. |
@@ -993,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
993 | if (is_free_buddy_page(p)) { | 987 | if (is_free_buddy_page(p)) { |
994 | action_result(pfn, "free buddy", DELAYED); | 988 | action_result(pfn, "free buddy", DELAYED); |
995 | return 0; | 989 | return 0; |
990 | } else if (PageHuge(hpage)) { | ||
991 | /* | ||
992 | * Check "just unpoisoned", "filter hit", and | ||
993 | * "race with other subpage." | ||
994 | */ | ||
995 | lock_page_nosync(hpage); | ||
996 | if (!PageHWPoison(hpage) | ||
997 | || (hwpoison_filter(p) && TestClearPageHWPoison(p)) | ||
998 | || (p != hpage && TestSetPageHWPoison(hpage))) { | ||
999 | atomic_long_sub(nr_pages, &mce_bad_pages); | ||
1000 | return 0; | ||
1001 | } | ||
1002 | set_page_hwpoison_huge_page(hpage); | ||
1003 | res = dequeue_hwpoisoned_huge_page(hpage); | ||
1004 | action_result(pfn, "free huge", | ||
1005 | res ? IGNORED : DELAYED); | ||
1006 | unlock_page(hpage); | ||
1007 | return res; | ||
996 | } else { | 1008 | } else { |
997 | action_result(pfn, "high order kernel", IGNORED); | 1009 | action_result(pfn, "high order kernel", IGNORED); |
998 | return -EBUSY; | 1010 | return -EBUSY; |
@@ -1147,16 +1159,26 @@ int unpoison_memory(unsigned long pfn) | |||
1147 | page = compound_head(p); | 1159 | page = compound_head(p); |
1148 | 1160 | ||
1149 | if (!PageHWPoison(p)) { | 1161 | if (!PageHWPoison(p)) { |
1150 | pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); | 1162 | pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); |
1151 | return 0; | 1163 | return 0; |
1152 | } | 1164 | } |
1153 | 1165 | ||
1154 | nr_pages = 1 << compound_order(page); | 1166 | nr_pages = 1 << compound_order(page); |
1155 | 1167 | ||
1156 | if (!get_page_unless_zero(page)) { | 1168 | if (!get_page_unless_zero(page)) { |
1169 | /* | ||
1170 | * Since HWPoisoned hugepage should have non-zero refcount, | ||
1171 | * race between memory failure and unpoison seems to happen. | ||
1172 | * In such case unpoison fails and memory failure runs | ||
1173 | * to the end. | ||
1174 | */ | ||
1175 | if (PageHuge(page)) { | ||
1176 | pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); | ||
1177 | return 0; | ||
1178 | } | ||
1157 | if (TestClearPageHWPoison(p)) | 1179 | if (TestClearPageHWPoison(p)) |
1158 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1180 | atomic_long_sub(nr_pages, &mce_bad_pages); |
1159 | pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); | 1181 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); |
1160 | return 0; | 1182 | return 0; |
1161 | } | 1183 | } |
1162 | 1184 | ||
@@ -1168,12 +1190,12 @@ int unpoison_memory(unsigned long pfn) | |||
1168 | * the free buddy page pool. | 1190 | * the free buddy page pool. |
1169 | */ | 1191 | */ |
1170 | if (TestClearPageHWPoison(page)) { | 1192 | if (TestClearPageHWPoison(page)) { |
1171 | pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); | 1193 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); |
1172 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1194 | atomic_long_sub(nr_pages, &mce_bad_pages); |
1173 | freeit = 1; | 1195 | freeit = 1; |
1196 | if (PageHuge(page)) | ||
1197 | clear_page_hwpoison_huge_page(page); | ||
1174 | } | 1198 | } |
1175 | if (PageHuge(p)) | ||
1176 | clear_page_hwpoison_huge_page(page); | ||
1177 | unlock_page(page); | 1199 | unlock_page(page); |
1178 | 1200 | ||
1179 | put_page(page); | 1201 | put_page(page); |
@@ -1187,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory); | |||
1187 | static struct page *new_page(struct page *p, unsigned long private, int **x) | 1209 | static struct page *new_page(struct page *p, unsigned long private, int **x) |
1188 | { | 1210 | { |
1189 | int nid = page_to_nid(p); | 1211 | int nid = page_to_nid(p); |
1190 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | 1212 | if (PageHuge(p)) |
1213 | return alloc_huge_page_node(page_hstate(compound_head(p)), | ||
1214 | nid); | ||
1215 | else | ||
1216 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | ||
1191 | } | 1217 | } |
1192 | 1218 | ||
1193 | /* | 1219 | /* |
@@ -1215,14 +1241,21 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1215 | * was free. | 1241 | * was free. |
1216 | */ | 1242 | */ |
1217 | set_migratetype_isolate(p); | 1243 | set_migratetype_isolate(p); |
1244 | /* | ||
1245 | * When the target page is a free hugepage, just remove it | ||
1246 | * from free hugepage list. | ||
1247 | */ | ||
1218 | if (!get_page_unless_zero(compound_head(p))) { | 1248 | if (!get_page_unless_zero(compound_head(p))) { |
1219 | if (is_free_buddy_page(p)) { | 1249 | if (PageHuge(p)) { |
1220 | pr_debug("get_any_page: %#lx free buddy page\n", pfn); | 1250 | pr_info("get_any_page: %#lx free huge page\n", pfn); |
1251 | ret = dequeue_hwpoisoned_huge_page(compound_head(p)); | ||
1252 | } else if (is_free_buddy_page(p)) { | ||
1253 | pr_info("get_any_page: %#lx free buddy page\n", pfn); | ||
1221 | /* Set hwpoison bit while page is still isolated */ | 1254 | /* Set hwpoison bit while page is still isolated */ |
1222 | SetPageHWPoison(p); | 1255 | SetPageHWPoison(p); |
1223 | ret = 0; | 1256 | ret = 0; |
1224 | } else { | 1257 | } else { |
1225 | pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", | 1258 | pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", |
1226 | pfn, p->flags); | 1259 | pfn, p->flags); |
1227 | ret = -EIO; | 1260 | ret = -EIO; |
1228 | } | 1261 | } |
@@ -1235,6 +1268,45 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1235 | return ret; | 1268 | return ret; |
1236 | } | 1269 | } |
1237 | 1270 | ||
1271 | static int soft_offline_huge_page(struct page *page, int flags) | ||
1272 | { | ||
1273 | int ret; | ||
1274 | unsigned long pfn = page_to_pfn(page); | ||
1275 | struct page *hpage = compound_head(page); | ||
1276 | LIST_HEAD(pagelist); | ||
1277 | |||
1278 | ret = get_any_page(page, pfn, flags); | ||
1279 | if (ret < 0) | ||
1280 | return ret; | ||
1281 | if (ret == 0) | ||
1282 | goto done; | ||
1283 | |||
1284 | if (PageHWPoison(hpage)) { | ||
1285 | put_page(hpage); | ||
1286 | pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); | ||
1287 | return -EBUSY; | ||
1288 | } | ||
1289 | |||
1290 | /* Keep page count to indicate a given hugepage is isolated. */ | ||
1291 | |||
1292 | list_add(&hpage->lru, &pagelist); | ||
1293 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | ||
1294 | if (ret) { | ||
1295 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | ||
1296 | pfn, ret, page->flags); | ||
1297 | if (ret > 0) | ||
1298 | ret = -EIO; | ||
1299 | return ret; | ||
1300 | } | ||
1301 | done: | ||
1302 | if (!PageHWPoison(hpage)) | ||
1303 | atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); | ||
1304 | set_page_hwpoison_huge_page(hpage); | ||
1305 | dequeue_hwpoisoned_huge_page(hpage); | ||
1306 | /* keep elevated page count for bad page */ | ||
1307 | return ret; | ||
1308 | } | ||
1309 | |||
1238 | /** | 1310 | /** |
1239 | * soft_offline_page - Soft offline a page. | 1311 | * soft_offline_page - Soft offline a page. |
1240 | * @page: page to offline | 1312 | * @page: page to offline |
@@ -1262,6 +1334,9 @@ int soft_offline_page(struct page *page, int flags) | |||
1262 | int ret; | 1334 | int ret; |
1263 | unsigned long pfn = page_to_pfn(page); | 1335 | unsigned long pfn = page_to_pfn(page); |
1264 | 1336 | ||
1337 | if (PageHuge(page)) | ||
1338 | return soft_offline_huge_page(page, flags); | ||
1339 | |||
1265 | ret = get_any_page(page, pfn, flags); | 1340 | ret = get_any_page(page, pfn, flags); |
1266 | if (ret < 0) | 1341 | if (ret < 0) |
1267 | return ret; | 1342 | return ret; |
@@ -1288,7 +1363,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1288 | goto done; | 1363 | goto done; |
1289 | } | 1364 | } |
1290 | if (!PageLRU(page)) { | 1365 | if (!PageLRU(page)) { |
1291 | pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", | 1366 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", |
1292 | pfn, page->flags); | 1367 | pfn, page->flags); |
1293 | return -EIO; | 1368 | return -EIO; |
1294 | } | 1369 | } |
@@ -1302,7 +1377,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1302 | if (PageHWPoison(page)) { | 1377 | if (PageHWPoison(page)) { |
1303 | unlock_page(page); | 1378 | unlock_page(page); |
1304 | put_page(page); | 1379 | put_page(page); |
1305 | pr_debug("soft offline: %#lx page already poisoned\n", pfn); | 1380 | pr_info("soft offline: %#lx page already poisoned\n", pfn); |
1306 | return -EBUSY; | 1381 | return -EBUSY; |
1307 | } | 1382 | } |
1308 | 1383 | ||
@@ -1323,7 +1398,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1323 | put_page(page); | 1398 | put_page(page); |
1324 | if (ret == 1) { | 1399 | if (ret == 1) { |
1325 | ret = 0; | 1400 | ret = 0; |
1326 | pr_debug("soft_offline: %#lx: invalidated\n", pfn); | 1401 | pr_info("soft_offline: %#lx: invalidated\n", pfn); |
1327 | goto done; | 1402 | goto done; |
1328 | } | 1403 | } |
1329 | 1404 | ||
@@ -1339,13 +1414,13 @@ int soft_offline_page(struct page *page, int flags) | |||
1339 | list_add(&page->lru, &pagelist); | 1414 | list_add(&page->lru, &pagelist); |
1340 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | 1415 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); |
1341 | if (ret) { | 1416 | if (ret) { |
1342 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | 1417 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1343 | pfn, ret, page->flags); | 1418 | pfn, ret, page->flags); |
1344 | if (ret > 0) | 1419 | if (ret > 0) |
1345 | ret = -EIO; | 1420 | ret = -EIO; |
1346 | } | 1421 | } |
1347 | } else { | 1422 | } else { |
1348 | pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", | 1423 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", |
1349 | pfn, ret, page_count(page), page->flags); | 1424 | pfn, ret, page_count(page), page->flags); |
1350 | } | 1425 | } |
1351 | if (ret) | 1426 | if (ret) |
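The pr_debug()-to-pr_info() conversions in this and the preceding hunks change who ever sees these messages: pr_debug() compiles away unless DEBUG is defined for the file (or dynamic debug enables the call site at run time), while pr_info() always reaches the log buffer at KERN_INFO. A generic illustration of the distinction, not code from this file:

	#define DEBUG			/* opt-in required for pr_debug() to emit anything */
	#include <linux/kernel.h>

	static void log_level_demo(unsigned long pfn)
	{
		/* Compiled out on default builds without DEBUG/dynamic debug. */
		pr_debug("soft offline detail for pfn %#lx\n", pfn);

		/* Always logged at KERN_INFO -- what the hunks above switch to
		 * so operators can see soft-offline outcomes by default. */
		pr_info("soft offline status for pfn %#lx\n", pfn);
	}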
diff --git a/mm/memory.c b/mm/memory.c index 98b58fecedef..af82741caaa4 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1450 | if (ret & VM_FAULT_OOM) | 1450 | if (ret & VM_FAULT_OOM) |
1451 | return i ? i : -ENOMEM; | 1451 | return i ? i : -ENOMEM; |
1452 | if (ret & | 1452 | if (ret & |
1453 | (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) | 1453 | (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| |
1454 | VM_FAULT_SIGBUS)) | ||
1454 | return i ? i : -EFAULT; | 1455 | return i ? i : -EFAULT; |
1455 | BUG(); | 1456 | BUG(); |
1456 | } | 1457 | } |
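The hunk above makes __get_user_pages() return -EFAULT, rather than hit BUG(), when the fault code carries VM_FAULT_HWPOISON_LARGE for a poisoned hugepage. Elsewhere in this series the same condition is reported to userspace as SIGBUS with si_addr_lsb set to the hugepage shift instead of PAGE_SHIFT. A hedged sketch of consuming that report, assuming a libc whose siginfo_t declares si_addr_lsb (older ones do not) and using fallback defines for the BUS_MCEERR_* codes:

	/*
	 * Sketch of a SIGBUS handler that reads the hwpoison report.  For a
	 * poisoned 4K page si_addr_lsb is 12; for a poisoned 2MB hugepage
	 * the kernel reports 21.  Not async-signal-safe (fprintf in a
	 * handler) -- demo only.
	 */
	#define _GNU_SOURCE
	#include <signal.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	#ifndef BUS_MCEERR_AR
	#define BUS_MCEERR_AR	4	/* kernel ABI value; illustration only */
	#endif
	#ifndef BUS_MCEERR_AO
	#define BUS_MCEERR_AO	5	/* kernel ABI value; illustration only */
	#endif

	static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
	{
		(void)sig; (void)ctx;
		if (si->si_code == BUS_MCEERR_AR || si->si_code == BUS_MCEERR_AO)
			fprintf(stderr, "hwpoison at %p, granularity 2^%d bytes\n",
				si->si_addr, (int)si->si_addr_lsb);
		_exit(1);
	}

	int main(void)
	{
		struct sigaction sa;

		memset(&sa, 0, sizeof(sa));
		sa.sa_sigaction = sigbus_handler;
		sa.sa_flags = SA_SIGINFO;
		sigaction(SIGBUS, &sa, NULL);

		/* ... touch a mapping the kernel has marked hwpoisoned ... */
		return 0;
	}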
diff --git a/mm/migrate.c b/mm/migrate.c index 38e7cad782f4..f8c9bccf2520 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
33 | #include <linux/memcontrol.h> | 33 | #include <linux/memcontrol.h> |
34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
35 | #include <linux/hugetlb.h> | ||
35 | #include <linux/gfp.h> | 36 | #include <linux/gfp.h> |
36 | 37 | ||
37 | #include "internal.h" | 38 | #include "internal.h" |
@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
95 | pte_t *ptep, pte; | 96 | pte_t *ptep, pte; |
96 | spinlock_t *ptl; | 97 | spinlock_t *ptl; |
97 | 98 | ||
98 | pgd = pgd_offset(mm, addr); | 99 | if (unlikely(PageHuge(new))) { |
99 | if (!pgd_present(*pgd)) | 100 | ptep = huge_pte_offset(mm, addr); |
100 | goto out; | 101 | if (!ptep) |
102 | goto out; | ||
103 | ptl = &mm->page_table_lock; | ||
104 | } else { | ||
105 | pgd = pgd_offset(mm, addr); | ||
106 | if (!pgd_present(*pgd)) | ||
107 | goto out; | ||
101 | 108 | ||
102 | pud = pud_offset(pgd, addr); | 109 | pud = pud_offset(pgd, addr); |
103 | if (!pud_present(*pud)) | 110 | if (!pud_present(*pud)) |
104 | goto out; | 111 | goto out; |
105 | 112 | ||
106 | pmd = pmd_offset(pud, addr); | 113 | pmd = pmd_offset(pud, addr); |
107 | if (!pmd_present(*pmd)) | 114 | if (!pmd_present(*pmd)) |
108 | goto out; | 115 | goto out; |
109 | 116 | ||
110 | ptep = pte_offset_map(pmd, addr); | 117 | ptep = pte_offset_map(pmd, addr); |
111 | 118 | ||
112 | if (!is_swap_pte(*ptep)) { | 119 | if (!is_swap_pte(*ptep)) { |
113 | pte_unmap(ptep); | 120 | pte_unmap(ptep); |
114 | goto out; | 121 | goto out; |
115 | } | 122 | } |
123 | |||
124 | ptl = pte_lockptr(mm, pmd); | ||
125 | } | ||
116 | 126 | ||
117 | ptl = pte_lockptr(mm, pmd); | ||
118 | spin_lock(ptl); | 127 | spin_lock(ptl); |
119 | pte = *ptep; | 128 | pte = *ptep; |
120 | if (!is_swap_pte(pte)) | 129 | if (!is_swap_pte(pte)) |
@@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
130 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 139 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
131 | if (is_write_migration_entry(entry)) | 140 | if (is_write_migration_entry(entry)) |
132 | pte = pte_mkwrite(pte); | 141 | pte = pte_mkwrite(pte); |
142 | #ifdef CONFIG_HUGETLB_PAGE | ||
143 | if (PageHuge(new)) | ||
144 | pte = pte_mkhuge(pte); | ||
145 | #endif | ||
133 | flush_cache_page(vma, addr, pte_pfn(pte)); | 146 | flush_cache_page(vma, addr, pte_pfn(pte)); |
134 | set_pte_at(mm, addr, ptep, pte); | 147 | set_pte_at(mm, addr, ptep, pte); |
135 | 148 | ||
136 | if (PageAnon(new)) | 149 | if (PageHuge(new)) { |
150 | if (PageAnon(new)) | ||
151 | hugepage_add_anon_rmap(new, vma, addr); | ||
152 | else | ||
153 | page_dup_rmap(new); | ||
154 | } else if (PageAnon(new)) | ||
137 | page_add_anon_rmap(new, vma, addr); | 155 | page_add_anon_rmap(new, vma, addr); |
138 | else | 156 | else |
139 | page_add_file_rmap(new); | 157 | page_add_file_rmap(new); |
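The two hunks above teach remove_migration_pte() to restore a hugepage pte from its migration entry, using huge_pte_offset() and mm->page_table_lock instead of the per-pmd lock. The forward direction, installing the entry during unmap, is not shown in these hunks; a kernel-context sketch of that step using the swapops.h helpers of this era (illustrative, not a standalone module; try_to_unmap() is what actually performs this during migration):

	#include <linux/mm.h>
	#include <linux/swap.h>
	#include <linux/swapops.h>

	/*
	 * Replace a present pte with a migration entry so that faulting
	 * threads wait until remove_migration_pte() (above) installs a pte
	 * for the new page.  Hugepages reuse the same entry format; only
	 * the pte location and the lock differ, as the hunk above shows.
	 */
	static void install_migration_entry(struct mm_struct *mm, unsigned long addr,
					    pte_t *ptep, struct page *page, int write)
	{
		swp_entry_t entry = make_migration_entry(page, write);

		set_pte_at(mm, addr, ptep, swp_entry_to_pte(entry));
	}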
@@ -276,11 +294,59 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
276 | } | 294 | } |
277 | 295 | ||
278 | /* | 296 | /* |
297 | * The expected number of remaining references is the same as that | ||
298 | * of migrate_page_move_mapping(). | ||
299 | */ | ||
300 | int migrate_huge_page_move_mapping(struct address_space *mapping, | ||
301 | struct page *newpage, struct page *page) | ||
302 | { | ||
303 | int expected_count; | ||
304 | void **pslot; | ||
305 | |||
306 | if (!mapping) { | ||
307 | if (page_count(page) != 1) | ||
308 | return -EAGAIN; | ||
309 | return 0; | ||
310 | } | ||
311 | |||
312 | spin_lock_irq(&mapping->tree_lock); | ||
313 | |||
314 | pslot = radix_tree_lookup_slot(&mapping->page_tree, | ||
315 | page_index(page)); | ||
316 | |||
317 | expected_count = 2 + page_has_private(page); | ||
318 | if (page_count(page) != expected_count || | ||
319 | (struct page *)radix_tree_deref_slot(pslot) != page) { | ||
320 | spin_unlock_irq(&mapping->tree_lock); | ||
321 | return -EAGAIN; | ||
322 | } | ||
323 | |||
324 | if (!page_freeze_refs(page, expected_count)) { | ||
325 | spin_unlock_irq(&mapping->tree_lock); | ||
326 | return -EAGAIN; | ||
327 | } | ||
328 | |||
329 | get_page(newpage); | ||
330 | |||
331 | radix_tree_replace_slot(pslot, newpage); | ||
332 | |||
333 | page_unfreeze_refs(page, expected_count); | ||
334 | |||
335 | __put_page(page); | ||
336 | |||
337 | spin_unlock_irq(&mapping->tree_lock); | ||
338 | return 0; | ||
339 | } | ||
340 | |||
341 | /* | ||
279 | * Copy the page to its new location | 342 | * Copy the page to its new location |
280 | */ | 343 | */ |
281 | static void migrate_page_copy(struct page *newpage, struct page *page) | 344 | void migrate_page_copy(struct page *newpage, struct page *page) |
282 | { | 345 | { |
283 | copy_highpage(newpage, page); | 346 | if (PageHuge(page)) |
347 | copy_huge_page(newpage, page); | ||
348 | else | ||
349 | copy_highpage(newpage, page); | ||
284 | 350 | ||
285 | if (PageError(page)) | 351 | if (PageError(page)) |
286 | SetPageError(newpage); | 352 | SetPageError(newpage); |
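migrate_page_copy() is made non-static for the hugepage path and now defers to copy_huge_page() for hugepages. copy_huge_page() lives on the hugetlb side of this series and is not shown in these hunks; its non-gigantic case is roughly a per-subpage loop along these lines (helper names assumed from mm/hugetlb.c):

	#include <linux/highmem.h>
	#include <linux/hugetlb.h>
	#include <linux/sched.h>

	/*
	 * Copy each 4K constituent page of a (non-gigantic) hugepage,
	 * yielding between pages: a 2MB hugepage means 512 copy_highpage()
	 * calls.
	 */
	static void copy_huge_page_sketch(struct page *dst, struct page *src)
	{
		int i;
		struct hstate *h = page_hstate(src);

		for (i = 0; i < pages_per_huge_page(h); i++) {
			cond_resched();
			copy_highpage(dst + i, src + i);
		}
	}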
@@ -724,6 +790,92 @@ move_newpage: | |||
724 | } | 790 | } |
725 | 791 | ||
726 | /* | 792 | /* |
793 | * Counterpart of unmap_and_move_page() for hugepage migration. | ||
794 | * | ||
795 | * This function doesn't wait for the completion of hugepage I/O | ||
796 | * because there is no race between I/O and migration for hugepage. | ||
797 | * Note that currently hugepage I/O occurs only in direct I/O | ||
798 | * where no lock is held and PG_writeback is irrelevant, | ||
799 | * and the writeback status of all subpages is counted in the reference | ||
800 | * count of the head page (i.e. if all subpages of a 2MB hugepage are | ||
801 | * under direct I/O, the reference of the head page is 512 and a bit more.) | ||
802 | * This means that when we try to migrate a hugepage whose subpages are | ||
803 | * doing direct I/O, some references remain after try_to_unmap() and | ||
804 | * hugepage migration fails without data corruption. | ||
805 | * | ||
806 | * There is also no race when direct I/O is issued on the page under migration, | ||
807 | * because then pte is replaced with migration swap entry and direct I/O code | ||
808 | * will wait in the page fault for migration to complete. | ||
809 | */ | ||
810 | static int unmap_and_move_huge_page(new_page_t get_new_page, | ||
811 | unsigned long private, struct page *hpage, | ||
812 | int force, int offlining) | ||
813 | { | ||
814 | int rc = 0; | ||
815 | int *result = NULL; | ||
816 | struct page *new_hpage = get_new_page(hpage, private, &result); | ||
817 | int rcu_locked = 0; | ||
818 | struct anon_vma *anon_vma = NULL; | ||
819 | |||
820 | if (!new_hpage) | ||
821 | return -ENOMEM; | ||
822 | |||
823 | rc = -EAGAIN; | ||
824 | |||
825 | if (!trylock_page(hpage)) { | ||
826 | if (!force) | ||
827 | goto out; | ||
828 | lock_page(hpage); | ||
829 | } | ||
830 | |||
831 | if (PageAnon(hpage)) { | ||
832 | rcu_read_lock(); | ||
833 | rcu_locked = 1; | ||
834 | |||
835 | if (page_mapped(hpage)) { | ||
836 | anon_vma = page_anon_vma(hpage); | ||
837 | atomic_inc(&anon_vma->external_refcount); | ||
838 | } | ||
839 | } | ||
840 | |||
841 | try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | ||
842 | |||
843 | if (!page_mapped(hpage)) | ||
844 | rc = move_to_new_page(new_hpage, hpage, 1); | ||
845 | |||
846 | if (rc) | ||
847 | remove_migration_ptes(hpage, hpage); | ||
848 | |||
849 | if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, | ||
850 | &anon_vma->lock)) { | ||
851 | int empty = list_empty(&anon_vma->head); | ||
852 | spin_unlock(&anon_vma->lock); | ||
853 | if (empty) | ||
854 | anon_vma_free(anon_vma); | ||
855 | } | ||
856 | |||
857 | if (rcu_locked) | ||
858 | rcu_read_unlock(); | ||
859 | out: | ||
860 | unlock_page(hpage); | ||
861 | |||
862 | if (rc != -EAGAIN) { | ||
863 | list_del(&hpage->lru); | ||
864 | put_page(hpage); | ||
865 | } | ||
866 | |||
867 | put_page(new_hpage); | ||
868 | |||
869 | if (result) { | ||
870 | if (rc) | ||
871 | *result = rc; | ||
872 | else | ||
873 | *result = page_to_nid(new_hpage); | ||
874 | } | ||
875 | return rc; | ||
876 | } | ||
877 | |||
878 | /* | ||
727 | * migrate_pages | 879 | * migrate_pages |
728 | * | 880 | * |
729 | * The function takes one list of pages to migrate and a function | 881 | * The function takes one list of pages to migrate and a function |
@@ -788,6 +940,52 @@ out: | |||
788 | return nr_failed + retry; | 940 | return nr_failed + retry; |
789 | } | 941 | } |
790 | 942 | ||
943 | int migrate_huge_pages(struct list_head *from, | ||
944 | new_page_t get_new_page, unsigned long private, int offlining) | ||
945 | { | ||
946 | int retry = 1; | ||
947 | int nr_failed = 0; | ||
948 | int pass = 0; | ||
949 | struct page *page; | ||
950 | struct page *page2; | ||
951 | int rc; | ||
952 | |||
953 | for (pass = 0; pass < 10 && retry; pass++) { | ||
954 | retry = 0; | ||
955 | |||
956 | list_for_each_entry_safe(page, page2, from, lru) { | ||
957 | cond_resched(); | ||
958 | |||
959 | rc = unmap_and_move_huge_page(get_new_page, | ||
960 | private, page, pass > 2, offlining); | ||
961 | |||
962 | switch(rc) { | ||
963 | case -ENOMEM: | ||
964 | goto out; | ||
965 | case -EAGAIN: | ||
966 | retry++; | ||
967 | break; | ||
968 | case 0: | ||
969 | break; | ||
970 | default: | ||
971 | /* Permanent failure */ | ||
972 | nr_failed++; | ||
973 | break; | ||
974 | } | ||
975 | } | ||
976 | } | ||
977 | rc = 0; | ||
978 | out: | ||
979 | |||
980 | list_for_each_entry_safe(page, page2, from, lru) | ||
981 | put_page(page); | ||
982 | |||
983 | if (rc) | ||
984 | return rc; | ||
985 | |||
986 | return nr_failed + retry; | ||
987 | } | ||
988 | |||
791 | #ifdef CONFIG_NUMA | 989 | #ifdef CONFIG_NUMA |
792 | /* | 990 | /* |
793 | * Move a list of individual pages | 991 | * Move a list of individual pages |
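migrate_huge_pages() takes the same new_page_t callback contract as migrate_pages(): given the source page and the caller's private cookie, return a destination page (the soft-offline caller above passes its new_page() allocator). A hedged sketch of such a callback once hugepages are in the mix; alloc_huge_page_node() is added on the hugetlb side of this series and its exact use here is an assumption:

	#include <linux/gfp.h>
	#include <linux/hugetlb.h>
	#include <linux/migrate.h>

	/*
	 * new_page_t callback: allocate the replacement on the same node as
	 * the page being migrated, drawing from the hugepage pool when the
	 * source is a hugepage.
	 */
	static struct page *new_page_sketch(struct page *p, unsigned long private,
					    int **result)
	{
		int nid = page_to_nid(p);

		if (PageHuge(p))
			return alloc_huge_page_node(page_hstate(compound_head(p)),
						    nid);
		return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
	}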
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -780,10 +780,10 @@ void page_move_anon_rmap(struct page *page, | |||
780 | } | 780 | } |
781 | 781 | ||
782 | /** | 782 | /** |
783 | * __page_set_anon_rmap - setup new anonymous rmap | 783 | * __page_set_anon_rmap - set up new anonymous rmap |
784 | * @page: the page to add the mapping to | 784 | * @page: Page to add to rmap |
785 | * @vma: the vm area in which the mapping is added | 785 | * @vma: VM area to add page to. |
786 | * @address: the user virtual address mapped | 786 | * @address: User virtual address of the mapping |
787 | * @exclusive: the page is exclusively owned by the current process | 787 | * @exclusive: the page is exclusively owned by the current process |
788 | */ | 788 | */ |
789 | static void __page_set_anon_rmap(struct page *page, | 789 | static void __page_set_anon_rmap(struct page *page, |
@@ -793,25 +793,16 @@ static void __page_set_anon_rmap(struct page *page, | |||
793 | 793 | ||
794 | BUG_ON(!anon_vma); | 794 | BUG_ON(!anon_vma); |
795 | 795 | ||
796 | if (PageAnon(page)) | ||
797 | return; | ||
798 | |||
796 | /* | 799 | /* |
797 | * If the page isn't exclusively mapped into this vma, | 800 | * If the page isn't exclusively mapped into this vma, |
798 | * we must use the _oldest_ possible anon_vma for the | 801 | * we must use the _oldest_ possible anon_vma for the |
799 | * page mapping! | 802 | * page mapping! |
800 | */ | 803 | */ |
801 | if (!exclusive) { | 804 | if (!exclusive) |
802 | if (PageAnon(page)) | ||
803 | return; | ||
804 | anon_vma = anon_vma->root; | 805 | anon_vma = anon_vma->root; |
805 | } else { | ||
806 | /* | ||
807 | * In this case, swapped-out-but-not-discarded swap-cache | ||
808 | * is remapped. So, no need to update page->mapping here. | ||
809 | * We convice anon_vma poitned by page->mapping is not obsolete | ||
810 | * because vma->anon_vma is necessary to be a family of it. | ||
811 | */ | ||
812 | if (PageAnon(page)) | ||
813 | return; | ||
814 | } | ||
815 | 806 | ||
816 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 807 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
817 | page->mapping = (struct address_space *) anon_vma; | 808 | page->mapping = (struct address_space *) anon_vma; |