diff options
| -rw-r--r-- | arch/ia64/include/asm/siginfo.h | 1 | ||||
| -rw-r--r-- | arch/x86/mm/fault.c | 19 | ||||
| -rw-r--r-- | fs/hugetlbfs/inode.c | 15 | ||||
| -rw-r--r-- | fs/signalfd.c | 10 | ||||
| -rw-r--r-- | include/linux/hugetlb.h | 17 | ||||
| -rw-r--r-- | include/linux/migrate.h | 16 | ||||
| -rw-r--r-- | include/linux/mm.h | 12 | ||||
| -rw-r--r-- | include/linux/signalfd.h | 3 | ||||
| -rw-r--r-- | mm/hugetlb.c | 233 | ||||
| -rw-r--r-- | mm/memory-failure.c | 175 | ||||
| -rw-r--r-- | mm/memory.c | 3 | ||||
| -rw-r--r-- | mm/migrate.c | 234 | ||||
| -rw-r--r-- | mm/rmap.c | 25 |
13 files changed, 596 insertions, 167 deletions
diff --git a/arch/ia64/include/asm/siginfo.h b/arch/ia64/include/asm/siginfo.h index 118d42979003..c8fcaa2ac48f 100644 --- a/arch/ia64/include/asm/siginfo.h +++ b/arch/ia64/include/asm/siginfo.h | |||
| @@ -62,6 +62,7 @@ typedef struct siginfo { | |||
| 62 | int _imm; /* immediate value for "break" */ | 62 | int _imm; /* immediate value for "break" */ |
| 63 | unsigned int _flags; /* see below */ | 63 | unsigned int _flags; /* see below */ |
| 64 | unsigned long _isr; /* isr */ | 64 | unsigned long _isr; /* isr */ |
| 65 | short _addr_lsb; /* lsb of faulting address */ | ||
| 65 | } _sigfault; | 66 | } _sigfault; |
| 66 | 67 | ||
| 67 | /* SIGPOLL */ | 68 | /* SIGPOLL */ |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 79b0b372d2d0..852b319edbdc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
| @@ -11,6 +11,7 @@ | |||
| 11 | #include <linux/kprobes.h> /* __kprobes, ... */ | 11 | #include <linux/kprobes.h> /* __kprobes, ... */ |
| 12 | #include <linux/mmiotrace.h> /* kmmio_handler, ... */ | 12 | #include <linux/mmiotrace.h> /* kmmio_handler, ... */ |
| 13 | #include <linux/perf_event.h> /* perf_sw_event */ | 13 | #include <linux/perf_event.h> /* perf_sw_event */ |
| 14 | #include <linux/hugetlb.h> /* hstate_index_to_shift */ | ||
| 14 | 15 | ||
| 15 | #include <asm/traps.h> /* dotraplinkage, ... */ | 16 | #include <asm/traps.h> /* dotraplinkage, ... */ |
| 16 | #include <asm/pgalloc.h> /* pgd_*(), ... */ | 17 | #include <asm/pgalloc.h> /* pgd_*(), ... */ |
| @@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) | |||
| 160 | 161 | ||
| 161 | static void | 162 | static void |
| 162 | force_sig_info_fault(int si_signo, int si_code, unsigned long address, | 163 | force_sig_info_fault(int si_signo, int si_code, unsigned long address, |
| 163 | struct task_struct *tsk) | 164 | struct task_struct *tsk, int fault) |
| 164 | { | 165 | { |
| 166 | unsigned lsb = 0; | ||
| 165 | siginfo_t info; | 167 | siginfo_t info; |
| 166 | 168 | ||
| 167 | info.si_signo = si_signo; | 169 | info.si_signo = si_signo; |
| 168 | info.si_errno = 0; | 170 | info.si_errno = 0; |
| 169 | info.si_code = si_code; | 171 | info.si_code = si_code; |
| 170 | info.si_addr = (void __user *)address; | 172 | info.si_addr = (void __user *)address; |
| 171 | info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; | 173 | if (fault & VM_FAULT_HWPOISON_LARGE) |
| 174 | lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); | ||
| 175 | if (fault & VM_FAULT_HWPOISON) | ||
| 176 | lsb = PAGE_SHIFT; | ||
| 177 | info.si_addr_lsb = lsb; | ||
| 172 | 178 | ||
| 173 | force_sig_info(si_signo, &info, tsk); | 179 | force_sig_info(si_signo, &info, tsk); |
| 174 | } | 180 | } |
| @@ -722,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, | |||
| 722 | tsk->thread.error_code = error_code | (address >= TASK_SIZE); | 728 | tsk->thread.error_code = error_code | (address >= TASK_SIZE); |
| 723 | tsk->thread.trap_no = 14; | 729 | tsk->thread.trap_no = 14; |
| 724 | 730 | ||
| 725 | force_sig_info_fault(SIGSEGV, si_code, address, tsk); | 731 | force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); |
| 726 | 732 | ||
| 727 | return; | 733 | return; |
| 728 | } | 734 | } |
| @@ -807,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, | |||
| 807 | tsk->thread.trap_no = 14; | 813 | tsk->thread.trap_no = 14; |
| 808 | 814 | ||
| 809 | #ifdef CONFIG_MEMORY_FAILURE | 815 | #ifdef CONFIG_MEMORY_FAILURE |
| 810 | if (fault & VM_FAULT_HWPOISON) { | 816 | if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { |
| 811 | printk(KERN_ERR | 817 | printk(KERN_ERR |
| 812 | "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", | 818 | "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", |
| 813 | tsk->comm, tsk->pid, address); | 819 | tsk->comm, tsk->pid, address); |
| 814 | code = BUS_MCEERR_AR; | 820 | code = BUS_MCEERR_AR; |
| 815 | } | 821 | } |
| 816 | #endif | 822 | #endif |
| 817 | force_sig_info_fault(SIGBUS, code, address, tsk); | 823 | force_sig_info_fault(SIGBUS, code, address, tsk, fault); |
| 818 | } | 824 | } |
| 819 | 825 | ||
| 820 | static noinline void | 826 | static noinline void |
| @@ -824,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, | |||
| 824 | if (fault & VM_FAULT_OOM) { | 830 | if (fault & VM_FAULT_OOM) { |
| 825 | out_of_memory(regs, error_code, address); | 831 | out_of_memory(regs, error_code, address); |
| 826 | } else { | 832 | } else { |
| 827 | if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) | 833 | if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| |
| 834 | VM_FAULT_HWPOISON_LARGE)) | ||
| 828 | do_sigbus(regs, error_code, address, fault); | 835 | do_sigbus(regs, error_code, address, fault); |
| 829 | else | 836 | else |
| 830 | BUG(); | 837 | BUG(); |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 113eba3d3c38..a14328d270e8 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #include <linux/statfs.h> | 31 | #include <linux/statfs.h> |
| 32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
| 33 | #include <linux/magic.h> | 33 | #include <linux/magic.h> |
| 34 | #include <linux/migrate.h> | ||
| 34 | 35 | ||
| 35 | #include <asm/uaccess.h> | 36 | #include <asm/uaccess.h> |
| 36 | 37 | ||
| @@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page) | |||
| 573 | return 0; | 574 | return 0; |
| 574 | } | 575 | } |
| 575 | 576 | ||
| 577 | static int hugetlbfs_migrate_page(struct address_space *mapping, | ||
| 578 | struct page *newpage, struct page *page) | ||
| 579 | { | ||
| 580 | int rc; | ||
| 581 | |||
| 582 | rc = migrate_huge_page_move_mapping(mapping, newpage, page); | ||
| 583 | if (rc) | ||
| 584 | return rc; | ||
| 585 | migrate_page_copy(newpage, page); | ||
| 586 | |||
| 587 | return 0; | ||
| 588 | } | ||
| 589 | |||
| 576 | static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) | 590 | static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) |
| 577 | { | 591 | { |
| 578 | struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); | 592 | struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); |
| @@ -659,6 +673,7 @@ static const struct address_space_operations hugetlbfs_aops = { | |||
| 659 | .write_begin = hugetlbfs_write_begin, | 673 | .write_begin = hugetlbfs_write_begin, |
| 660 | .write_end = hugetlbfs_write_end, | 674 | .write_end = hugetlbfs_write_end, |
| 661 | .set_page_dirty = hugetlbfs_set_page_dirty, | 675 | .set_page_dirty = hugetlbfs_set_page_dirty, |
| 676 | .migratepage = hugetlbfs_migrate_page, | ||
| 662 | }; | 677 | }; |
| 663 | 678 | ||
| 664 | 679 | ||
diff --git a/fs/signalfd.c b/fs/signalfd.c index 74047304b01a..492465b451dd 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c | |||
| @@ -99,6 +99,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, | |||
| 99 | #ifdef __ARCH_SI_TRAPNO | 99 | #ifdef __ARCH_SI_TRAPNO |
| 100 | err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); | 100 | err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); |
| 101 | #endif | 101 | #endif |
| 102 | #ifdef BUS_MCEERR_AO | ||
| 103 | /* | ||
| 104 | * Other callers might not initialize the si_addr_lsb field, | ||
| 105 | * so check explicitly for the right codes here. | ||
| 106 | */ | ||
| 107 | if (kinfo->si_code == BUS_MCEERR_AR || | ||
| 108 | kinfo->si_code == BUS_MCEERR_AO) | ||
| 109 | err |= __put_user((short) kinfo->si_addr_lsb, | ||
| 110 | &uinfo->ssi_addr_lsb); | ||
| 111 | #endif | ||
| 102 | break; | 112 | break; |
| 103 | case __SI_CHLD: | 113 | case __SI_CHLD: |
| 104 | err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); | 114 | err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); |
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f479700df61b..943c76b3d4bb 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
| @@ -43,7 +43,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to, | |||
| 43 | struct vm_area_struct *vma, | 43 | struct vm_area_struct *vma, |
| 44 | int acctflags); | 44 | int acctflags); |
| 45 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); | 45 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); |
| 46 | void __isolate_hwpoisoned_huge_page(struct page *page); | 46 | int dequeue_hwpoisoned_huge_page(struct page *page); |
| 47 | void copy_huge_page(struct page *dst, struct page *src); | ||
| 47 | 48 | ||
| 48 | extern unsigned long hugepages_treat_as_movable; | 49 | extern unsigned long hugepages_treat_as_movable; |
| 49 | extern const unsigned long hugetlb_zero, hugetlb_infinity; | 50 | extern const unsigned long hugetlb_zero, hugetlb_infinity; |
| @@ -101,7 +102,10 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) | |||
| 101 | #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) | 102 | #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) |
| 102 | #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) | 103 | #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) |
| 103 | #define huge_pte_offset(mm, address) 0 | 104 | #define huge_pte_offset(mm, address) 0 |
| 104 | #define __isolate_hwpoisoned_huge_page(page) 0 | 105 | #define dequeue_hwpoisoned_huge_page(page) 0 |
| 106 | static inline void copy_huge_page(struct page *dst, struct page *src) | ||
| 107 | { | ||
| 108 | } | ||
| 105 | 109 | ||
| 106 | #define hugetlb_change_protection(vma, address, end, newprot) | 110 | #define hugetlb_change_protection(vma, address, end, newprot) |
| 107 | 111 | ||
| @@ -228,6 +232,8 @@ struct huge_bootmem_page { | |||
| 228 | struct hstate *hstate; | 232 | struct hstate *hstate; |
| 229 | }; | 233 | }; |
| 230 | 234 | ||
| 235 | struct page *alloc_huge_page_node(struct hstate *h, int nid); | ||
| 236 | |||
| 231 | /* arch callback */ | 237 | /* arch callback */ |
| 232 | int __init alloc_bootmem_huge_page(struct hstate *h); | 238 | int __init alloc_bootmem_huge_page(struct hstate *h); |
| 233 | 239 | ||
| @@ -301,8 +307,14 @@ static inline struct hstate *page_hstate(struct page *page) | |||
| 301 | return size_to_hstate(PAGE_SIZE << compound_order(page)); | 307 | return size_to_hstate(PAGE_SIZE << compound_order(page)); |
| 302 | } | 308 | } |
| 303 | 309 | ||
| 310 | static inline unsigned hstate_index_to_shift(unsigned index) | ||
| 311 | { | ||
| 312 | return hstates[index].order + PAGE_SHIFT; | ||
| 313 | } | ||
| 314 | |||
| 304 | #else | 315 | #else |
| 305 | struct hstate {}; | 316 | struct hstate {}; |
| 317 | #define alloc_huge_page_node(h, nid) NULL | ||
| 306 | #define alloc_bootmem_huge_page(h) NULL | 318 | #define alloc_bootmem_huge_page(h) NULL |
| 307 | #define hstate_file(f) NULL | 319 | #define hstate_file(f) NULL |
| 308 | #define hstate_vma(v) NULL | 320 | #define hstate_vma(v) NULL |
| @@ -317,6 +329,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h) | |||
| 317 | { | 329 | { |
| 318 | return 1; | 330 | return 1; |
| 319 | } | 331 | } |
| 332 | #define hstate_index_to_shift(index) 0 | ||
| 320 | #endif | 333 | #endif |
| 321 | 334 | ||
| 322 | #endif /* _LINUX_HUGETLB_H */ | 335 | #endif /* _LINUX_HUGETLB_H */ |
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 7238231b8dd4..085527fb8261 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h | |||
| @@ -14,6 +14,8 @@ extern int migrate_page(struct address_space *, | |||
| 14 | struct page *, struct page *); | 14 | struct page *, struct page *); |
| 15 | extern int migrate_pages(struct list_head *l, new_page_t x, | 15 | extern int migrate_pages(struct list_head *l, new_page_t x, |
| 16 | unsigned long private, int offlining); | 16 | unsigned long private, int offlining); |
| 17 | extern int migrate_huge_pages(struct list_head *l, new_page_t x, | ||
| 18 | unsigned long private, int offlining); | ||
| 17 | 19 | ||
| 18 | extern int fail_migrate_page(struct address_space *, | 20 | extern int fail_migrate_page(struct address_space *, |
| 19 | struct page *, struct page *); | 21 | struct page *, struct page *); |
| @@ -23,12 +25,17 @@ extern int migrate_prep_local(void); | |||
| 23 | extern int migrate_vmas(struct mm_struct *mm, | 25 | extern int migrate_vmas(struct mm_struct *mm, |
| 24 | const nodemask_t *from, const nodemask_t *to, | 26 | const nodemask_t *from, const nodemask_t *to, |
| 25 | unsigned long flags); | 27 | unsigned long flags); |
| 28 | extern void migrate_page_copy(struct page *newpage, struct page *page); | ||
| 29 | extern int migrate_huge_page_move_mapping(struct address_space *mapping, | ||
| 30 | struct page *newpage, struct page *page); | ||
| 26 | #else | 31 | #else |
| 27 | #define PAGE_MIGRATION 0 | 32 | #define PAGE_MIGRATION 0 |
| 28 | 33 | ||
| 29 | static inline void putback_lru_pages(struct list_head *l) {} | 34 | static inline void putback_lru_pages(struct list_head *l) {} |
| 30 | static inline int migrate_pages(struct list_head *l, new_page_t x, | 35 | static inline int migrate_pages(struct list_head *l, new_page_t x, |
| 31 | unsigned long private, int offlining) { return -ENOSYS; } | 36 | unsigned long private, int offlining) { return -ENOSYS; } |
| 37 | static inline int migrate_huge_pages(struct list_head *l, new_page_t x, | ||
| 38 | unsigned long private, int offlining) { return -ENOSYS; } | ||
| 32 | 39 | ||
| 33 | static inline int migrate_prep(void) { return -ENOSYS; } | 40 | static inline int migrate_prep(void) { return -ENOSYS; } |
| 34 | static inline int migrate_prep_local(void) { return -ENOSYS; } | 41 | static inline int migrate_prep_local(void) { return -ENOSYS; } |
| @@ -40,6 +47,15 @@ static inline int migrate_vmas(struct mm_struct *mm, | |||
| 40 | return -ENOSYS; | 47 | return -ENOSYS; |
| 41 | } | 48 | } |
| 42 | 49 | ||
| 50 | static inline void migrate_page_copy(struct page *newpage, | ||
| 51 | struct page *page) {} | ||
| 52 | |||
| 53 | static inline int migrate_huge_page_move_mapping(struct address_space *mapping, | ||
| 54 | struct page *newpage, struct page *page) | ||
| 55 | { | ||
| 56 | return -ENOSYS; | ||
| 57 | } | ||
| 58 | |||
| 43 | /* Possible settings for the migratepage() method in address_space_operations */ | 59 | /* Possible settings for the migratepage() method in address_space_operations */ |
| 44 | #define migrate_page NULL | 60 | #define migrate_page NULL |
| 45 | #define fail_migrate_page NULL | 61 | #define fail_migrate_page NULL |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 7687228dd3b7..a4c66846fb8f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
| @@ -718,12 +718,20 @@ static inline int page_mapped(struct page *page) | |||
| 718 | #define VM_FAULT_SIGBUS 0x0002 | 718 | #define VM_FAULT_SIGBUS 0x0002 |
| 719 | #define VM_FAULT_MAJOR 0x0004 | 719 | #define VM_FAULT_MAJOR 0x0004 |
| 720 | #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ | 720 | #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ |
| 721 | #define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned page */ | 721 | #define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */ |
| 722 | #define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */ | ||
| 722 | 723 | ||
| 723 | #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ | 724 | #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ |
| 724 | #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ | 725 | #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ |
| 725 | 726 | ||
| 726 | #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON) | 727 | #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ |
| 728 | |||
| 729 | #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \ | ||
| 730 | VM_FAULT_HWPOISON_LARGE) | ||
| 731 | |||
| 732 | /* Encode hstate index for a hwpoisoned large page */ | ||
| 733 | #define VM_FAULT_SET_HINDEX(x) ((x) << 12) | ||
| 734 | #define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf) | ||
| 727 | 735 | ||
| 728 | /* | 736 | /* |
| 729 | * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. | 737 | * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. |
diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h index b363b916c909..3ff4961da9b5 100644 --- a/include/linux/signalfd.h +++ b/include/linux/signalfd.h | |||
| @@ -33,6 +33,7 @@ struct signalfd_siginfo { | |||
| 33 | __u64 ssi_utime; | 33 | __u64 ssi_utime; |
| 34 | __u64 ssi_stime; | 34 | __u64 ssi_stime; |
| 35 | __u64 ssi_addr; | 35 | __u64 ssi_addr; |
| 36 | __u16 ssi_addr_lsb; | ||
| 36 | 37 | ||
| 37 | /* | 38 | /* |
| 38 | * Pad structure to 128 bytes. Remember to update the | 39 | * Pad structure to 128 bytes. Remember to update the |
| @@ -43,7 +44,7 @@ struct signalfd_siginfo { | |||
| 43 | * comes out of a read(2) and we really don't want to have | 44 | * comes out of a read(2) and we really don't want to have |
| 44 | * a compat on read(2). | 45 | * a compat on read(2). |
| 45 | */ | 46 | */ |
| 46 | __u8 __pad[48]; | 47 | __u8 __pad[46]; |
| 47 | }; | 48 | }; |
| 48 | 49 | ||
| 49 | 50 | ||
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c03273807182..96991ded82fe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page, | |||
| 423 | } | 423 | } |
| 424 | } | 424 | } |
| 425 | 425 | ||
| 426 | static void copy_gigantic_page(struct page *dst, struct page *src, | 426 | static void copy_user_gigantic_page(struct page *dst, struct page *src, |
| 427 | unsigned long addr, struct vm_area_struct *vma) | 427 | unsigned long addr, struct vm_area_struct *vma) |
| 428 | { | 428 | { |
| 429 | int i; | 429 | int i; |
| 430 | struct hstate *h = hstate_vma(vma); | 430 | struct hstate *h = hstate_vma(vma); |
| 431 | struct page *dst_base = dst; | 431 | struct page *dst_base = dst; |
| 432 | struct page *src_base = src; | 432 | struct page *src_base = src; |
| 433 | might_sleep(); | 433 | |
| 434 | for (i = 0; i < pages_per_huge_page(h); ) { | 434 | for (i = 0; i < pages_per_huge_page(h); ) { |
| 435 | cond_resched(); | 435 | cond_resched(); |
| 436 | copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); | 436 | copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); |
| @@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src, | |||
| 440 | src = mem_map_next(src, src_base, i); | 440 | src = mem_map_next(src, src_base, i); |
| 441 | } | 441 | } |
| 442 | } | 442 | } |
| 443 | static void copy_huge_page(struct page *dst, struct page *src, | 443 | |
| 444 | static void copy_user_huge_page(struct page *dst, struct page *src, | ||
| 444 | unsigned long addr, struct vm_area_struct *vma) | 445 | unsigned long addr, struct vm_area_struct *vma) |
| 445 | { | 446 | { |
| 446 | int i; | 447 | int i; |
| 447 | struct hstate *h = hstate_vma(vma); | 448 | struct hstate *h = hstate_vma(vma); |
| 448 | 449 | ||
| 449 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { | 450 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { |
| 450 | copy_gigantic_page(dst, src, addr, vma); | 451 | copy_user_gigantic_page(dst, src, addr, vma); |
| 451 | return; | 452 | return; |
| 452 | } | 453 | } |
| 453 | 454 | ||
| @@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src, | |||
| 458 | } | 459 | } |
| 459 | } | 460 | } |
| 460 | 461 | ||
| 462 | static void copy_gigantic_page(struct page *dst, struct page *src) | ||
| 463 | { | ||
| 464 | int i; | ||
| 465 | struct hstate *h = page_hstate(src); | ||
| 466 | struct page *dst_base = dst; | ||
| 467 | struct page *src_base = src; | ||
| 468 | |||
| 469 | for (i = 0; i < pages_per_huge_page(h); ) { | ||
| 470 | cond_resched(); | ||
| 471 | copy_highpage(dst, src); | ||
| 472 | |||
| 473 | i++; | ||
| 474 | dst = mem_map_next(dst, dst_base, i); | ||
| 475 | src = mem_map_next(src, src_base, i); | ||
| 476 | } | ||
| 477 | } | ||
| 478 | |||
| 479 | void copy_huge_page(struct page *dst, struct page *src) | ||
| 480 | { | ||
| 481 | int i; | ||
| 482 | struct hstate *h = page_hstate(src); | ||
| 483 | |||
| 484 | if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { | ||
| 485 | copy_gigantic_page(dst, src); | ||
| 486 | return; | ||
| 487 | } | ||
| 488 | |||
| 489 | might_sleep(); | ||
| 490 | for (i = 0; i < pages_per_huge_page(h); i++) { | ||
| 491 | cond_resched(); | ||
| 492 | copy_highpage(dst + i, src + i); | ||
| 493 | } | ||
| 494 | } | ||
| 495 | |||
| 461 | static void enqueue_huge_page(struct hstate *h, struct page *page) | 496 | static void enqueue_huge_page(struct hstate *h, struct page *page) |
| 462 | { | 497 | { |
| 463 | int nid = page_to_nid(page); | 498 | int nid = page_to_nid(page); |
| @@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) | |||
| 466 | h->free_huge_pages_node[nid]++; | 501 | h->free_huge_pages_node[nid]++; |
| 467 | } | 502 | } |
| 468 | 503 | ||
| 504 | static struct page *dequeue_huge_page_node(struct hstate *h, int nid) | ||
| 505 | { | ||
| 506 | struct page *page; | ||
| 507 | |||
| 508 | if (list_empty(&h->hugepage_freelists[nid])) | ||
| 509 | return NULL; | ||
| 510 | page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); | ||
| 511 | list_del(&page->lru); | ||
| 512 | set_page_refcounted(page); | ||
| 513 | h->free_huge_pages--; | ||
| 514 | h->free_huge_pages_node[nid]--; | ||
| 515 | return page; | ||
| 516 | } | ||
| 517 | |||
| 469 | static struct page *dequeue_huge_page_vma(struct hstate *h, | 518 | static struct page *dequeue_huge_page_vma(struct hstate *h, |
| 470 | struct vm_area_struct *vma, | 519 | struct vm_area_struct *vma, |
| 471 | unsigned long address, int avoid_reserve) | 520 | unsigned long address, int avoid_reserve) |
| 472 | { | 521 | { |
| 473 | int nid; | ||
| 474 | struct page *page = NULL; | 522 | struct page *page = NULL; |
| 475 | struct mempolicy *mpol; | 523 | struct mempolicy *mpol; |
| 476 | nodemask_t *nodemask; | 524 | nodemask_t *nodemask; |
| @@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
| 496 | 544 | ||
| 497 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 545 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
| 498 | MAX_NR_ZONES - 1, nodemask) { | 546 | MAX_NR_ZONES - 1, nodemask) { |
| 499 | nid = zone_to_nid(zone); | 547 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { |
| 500 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && | 548 | page = dequeue_huge_page_node(h, zone_to_nid(zone)); |
| 501 | !list_empty(&h->hugepage_freelists[nid])) { | 549 | if (page) { |
| 502 | page = list_entry(h->hugepage_freelists[nid].next, | 550 | if (!avoid_reserve) |
| 503 | struct page, lru); | 551 | decrement_hugepage_resv_vma(h, vma); |
| 504 | list_del(&page->lru); | 552 | break; |
| 505 | h->free_huge_pages--; | 553 | } |
| 506 | h->free_huge_pages_node[nid]--; | ||
| 507 | |||
| 508 | if (!avoid_reserve) | ||
| 509 | decrement_hugepage_resv_vma(h, vma); | ||
| 510 | |||
| 511 | break; | ||
| 512 | } | 554 | } |
| 513 | } | 555 | } |
| 514 | err: | 556 | err: |
| @@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, | |||
| 770 | return ret; | 812 | return ret; |
| 771 | } | 813 | } |
| 772 | 814 | ||
| 773 | static struct page *alloc_buddy_huge_page(struct hstate *h, | 815 | static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) |
| 774 | struct vm_area_struct *vma, unsigned long address) | ||
| 775 | { | 816 | { |
| 776 | struct page *page; | 817 | struct page *page; |
| 777 | unsigned int nid; | 818 | unsigned int r_nid; |
| 778 | 819 | ||
| 779 | if (h->order >= MAX_ORDER) | 820 | if (h->order >= MAX_ORDER) |
| 780 | return NULL; | 821 | return NULL; |
| @@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, | |||
| 812 | } | 853 | } |
| 813 | spin_unlock(&hugetlb_lock); | 854 | spin_unlock(&hugetlb_lock); |
| 814 | 855 | ||
| 815 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP| | 856 | if (nid == NUMA_NO_NODE) |
| 816 | __GFP_REPEAT|__GFP_NOWARN, | 857 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP| |
| 817 | huge_page_order(h)); | 858 | __GFP_REPEAT|__GFP_NOWARN, |
| 859 | huge_page_order(h)); | ||
| 860 | else | ||
| 861 | page = alloc_pages_exact_node(nid, | ||
| 862 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| | ||
| 863 | __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); | ||
| 818 | 864 | ||
| 819 | if (page && arch_prepare_hugepage(page)) { | 865 | if (page && arch_prepare_hugepage(page)) { |
| 820 | __free_pages(page, huge_page_order(h)); | 866 | __free_pages(page, huge_page_order(h)); |
| @@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, | |||
| 823 | 869 | ||
| 824 | spin_lock(&hugetlb_lock); | 870 | spin_lock(&hugetlb_lock); |
| 825 | if (page) { | 871 | if (page) { |
| 826 | /* | 872 | r_nid = page_to_nid(page); |
| 827 | * This page is now managed by the hugetlb allocator and has | ||
| 828 | * no users -- drop the buddy allocator's reference. | ||
| 829 | */ | ||
| 830 | put_page_testzero(page); | ||
| 831 | VM_BUG_ON(page_count(page)); | ||
| 832 | nid = page_to_nid(page); | ||
| 833 | set_compound_page_dtor(page, free_huge_page); | 873 | set_compound_page_dtor(page, free_huge_page); |
| 834 | /* | 874 | /* |
| 835 | * We incremented the global counters already | 875 | * We incremented the global counters already |
| 836 | */ | 876 | */ |
| 837 | h->nr_huge_pages_node[nid]++; | 877 | h->nr_huge_pages_node[r_nid]++; |
| 838 | h->surplus_huge_pages_node[nid]++; | 878 | h->surplus_huge_pages_node[r_nid]++; |
| 839 | __count_vm_event(HTLB_BUDDY_PGALLOC); | 879 | __count_vm_event(HTLB_BUDDY_PGALLOC); |
| 840 | } else { | 880 | } else { |
| 841 | h->nr_huge_pages--; | 881 | h->nr_huge_pages--; |
| @@ -848,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, | |||
| 848 | } | 888 | } |
| 849 | 889 | ||
| 850 | /* | 890 | /* |
| 891 | * This allocation function is useful in the context where vma is irrelevant. | ||
| 892 | * E.g. soft-offlining uses this function because it only cares about the | ||
| 893 | * physical address of the error page. | ||
| 894 | */ | ||
| 895 | struct page *alloc_huge_page_node(struct hstate *h, int nid) | ||
| 896 | { | ||
| 897 | struct page *page; | ||
| 898 | |||
| 899 | spin_lock(&hugetlb_lock); | ||
| 900 | page = dequeue_huge_page_node(h, nid); | ||
| 901 | spin_unlock(&hugetlb_lock); | ||
| 902 | |||
| 903 | if (!page) | ||
| 904 | page = alloc_buddy_huge_page(h, nid); | ||
| 905 | |||
| 906 | return page; | ||
| 907 | } | ||
| 908 | |||
| 909 | /* | ||
| 851 | * Increase the hugetlb pool such that it can accommodate a reservation | 910 | * Increase the hugetlb pool such that it can accommodate a reservation |
| 852 | * of size 'delta'. | 911 | * of size 'delta'. |
| 853 | */ | 912 | */ |
| @@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta) | |||
| 871 | retry: | 930 | retry: |
| 872 | spin_unlock(&hugetlb_lock); | 931 | spin_unlock(&hugetlb_lock); |
| 873 | for (i = 0; i < needed; i++) { | 932 | for (i = 0; i < needed; i++) { |
| 874 | page = alloc_buddy_huge_page(h, NULL, 0); | 933 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
| 875 | if (!page) { | 934 | if (!page) |
| 876 | /* | 935 | /* |
| 877 | * We were not able to allocate enough pages to | 936 | * We were not able to allocate enough pages to |
| 878 | * satisfy the entire reservation so we free what | 937 | * satisfy the entire reservation so we free what |
| 879 | * we've allocated so far. | 938 | * we've allocated so far. |
| 880 | */ | 939 | */ |
| 881 | spin_lock(&hugetlb_lock); | ||
| 882 | needed = 0; | ||
| 883 | goto free; | 940 | goto free; |
| 884 | } | ||
| 885 | 941 | ||
| 886 | list_add(&page->lru, &surplus_list); | 942 | list_add(&page->lru, &surplus_list); |
| 887 | } | 943 | } |
| @@ -908,31 +964,31 @@ retry: | |||
| 908 | needed += allocated; | 964 | needed += allocated; |
| 909 | h->resv_huge_pages += delta; | 965 | h->resv_huge_pages += delta; |
| 910 | ret = 0; | 966 | ret = 0; |
| 911 | free: | 967 | |
| 968 | spin_unlock(&hugetlb_lock); | ||
| 912 | /* Free the needed pages to the hugetlb pool */ | 969 | /* Free the needed pages to the hugetlb pool */ |
| 913 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 970 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
| 914 | if ((--needed) < 0) | 971 | if ((--needed) < 0) |
| 915 | break; | 972 | break; |
| 916 | list_del(&page->lru); | 973 | list_del(&page->lru); |
| 974 | /* | ||
| 975 | * This page is now managed by the hugetlb allocator and has | ||
| 976 | * no users -- drop the buddy allocator's reference. | ||
| 977 | */ | ||
| 978 | put_page_testzero(page); | ||
| 979 | VM_BUG_ON(page_count(page)); | ||
| 917 | enqueue_huge_page(h, page); | 980 | enqueue_huge_page(h, page); |
| 918 | } | 981 | } |
| 919 | 982 | ||
| 920 | /* Free unnecessary surplus pages to the buddy allocator */ | 983 | /* Free unnecessary surplus pages to the buddy allocator */ |
| 984 | free: | ||
| 921 | if (!list_empty(&surplus_list)) { | 985 | if (!list_empty(&surplus_list)) { |
| 922 | spin_unlock(&hugetlb_lock); | ||
| 923 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 986 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
| 924 | list_del(&page->lru); | 987 | list_del(&page->lru); |
| 925 | /* | 988 | put_page(page); |
| 926 | * The page has a reference count of zero already, so | ||
| 927 | * call free_huge_page directly instead of using | ||
| 928 | * put_page. This must be done with hugetlb_lock | ||
| 929 | * unlocked which is safe because free_huge_page takes | ||
| 930 | * hugetlb_lock before deciding how to free the page. | ||
| 931 | */ | ||
| 932 | free_huge_page(page); | ||
| 933 | } | 989 | } |
| 934 | spin_lock(&hugetlb_lock); | ||
| 935 | } | 990 | } |
| 991 | spin_lock(&hugetlb_lock); | ||
| 936 | 992 | ||
| 937 | return ret; | 993 | return ret; |
| 938 | } | 994 | } |
| @@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
| 1052 | spin_unlock(&hugetlb_lock); | 1108 | spin_unlock(&hugetlb_lock); |
| 1053 | 1109 | ||
| 1054 | if (!page) { | 1110 | if (!page) { |
| 1055 | page = alloc_buddy_huge_page(h, vma, addr); | 1111 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
| 1056 | if (!page) { | 1112 | if (!page) { |
| 1057 | hugetlb_put_quota(inode->i_mapping, chg); | 1113 | hugetlb_put_quota(inode->i_mapping, chg); |
| 1058 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1114 | return ERR_PTR(-VM_FAULT_SIGBUS); |
| 1059 | } | 1115 | } |
| 1060 | } | 1116 | } |
| 1061 | 1117 | ||
| 1062 | set_page_refcounted(page); | ||
| 1063 | set_page_private(page, (unsigned long) mapping); | 1118 | set_page_private(page, (unsigned long) mapping); |
| 1064 | 1119 | ||
| 1065 | vma_commit_reservation(h, vma, addr); | 1120 | vma_commit_reservation(h, vma, addr); |
| @@ -2153,6 +2208,19 @@ nomem: | |||
| 2153 | return -ENOMEM; | 2208 | return -ENOMEM; |
| 2154 | } | 2209 | } |
| 2155 | 2210 | ||
| 2211 | static int is_hugetlb_entry_migration(pte_t pte) | ||
| 2212 | { | ||
| 2213 | swp_entry_t swp; | ||
| 2214 | |||
| 2215 | if (huge_pte_none(pte) || pte_present(pte)) | ||
| 2216 | return 0; | ||
| 2217 | swp = pte_to_swp_entry(pte); | ||
| 2218 | if (non_swap_entry(swp) && is_migration_entry(swp)) { | ||
| 2219 | return 1; | ||
| 2220 | } else | ||
| 2221 | return 0; | ||
| 2222 | } | ||
| 2223 | |||
| 2156 | static int is_hugetlb_entry_hwpoisoned(pte_t pte) | 2224 | static int is_hugetlb_entry_hwpoisoned(pte_t pte) |
| 2157 | { | 2225 | { |
| 2158 | swp_entry_t swp; | 2226 | swp_entry_t swp; |
| @@ -2383,7 +2451,7 @@ retry_avoidcopy: | |||
| 2383 | if (unlikely(anon_vma_prepare(vma))) | 2451 | if (unlikely(anon_vma_prepare(vma))) |
| 2384 | return VM_FAULT_OOM; | 2452 | return VM_FAULT_OOM; |
| 2385 | 2453 | ||
| 2386 | copy_huge_page(new_page, old_page, address, vma); | 2454 | copy_user_huge_page(new_page, old_page, address, vma); |
| 2387 | __SetPageUptodate(new_page); | 2455 | __SetPageUptodate(new_page); |
| 2388 | 2456 | ||
| 2389 | /* | 2457 | /* |
| @@ -2515,22 +2583,20 @@ retry: | |||
| 2515 | hugepage_add_new_anon_rmap(page, vma, address); | 2583 | hugepage_add_new_anon_rmap(page, vma, address); |
| 2516 | } | 2584 | } |
| 2517 | } else { | 2585 | } else { |
| 2586 | /* | ||
| 2587 | * If memory error occurs between mmap() and fault, some process | ||
| 2588 | * don't have hwpoisoned swap entry for errored virtual address. | ||
| 2589 | * So we need to block hugepage fault by PG_hwpoison bit check. | ||
| 2590 | */ | ||
| 2591 | if (unlikely(PageHWPoison(page))) { | ||
| 2592 | ret = VM_FAULT_HWPOISON | | ||
| 2593 | VM_FAULT_SET_HINDEX(h - hstates); | ||
| 2594 | goto backout_unlocked; | ||
| 2595 | } | ||
| 2518 | page_dup_rmap(page); | 2596 | page_dup_rmap(page); |
| 2519 | } | 2597 | } |
| 2520 | 2598 | ||
| 2521 | /* | 2599 | /* |
| 2522 | * Since memory error handler replaces pte into hwpoison swap entry | ||
| 2523 | * at the time of error handling, a process which reserved but not have | ||
| 2524 | * the mapping to the error hugepage does not have hwpoison swap entry. | ||
| 2525 | * So we need to block accesses from such a process by checking | ||
| 2526 | * PG_hwpoison bit here. | ||
| 2527 | */ | ||
| 2528 | if (unlikely(PageHWPoison(page))) { | ||
| 2529 | ret = VM_FAULT_HWPOISON; | ||
| 2530 | goto backout_unlocked; | ||
| 2531 | } | ||
| 2532 | |||
| 2533 | /* | ||
| 2534 | * If we are going to COW a private mapping later, we examine the | 2600 | * If we are going to COW a private mapping later, we examine the |
| 2535 | * pending reservations for this page now. This will ensure that | 2601 | * pending reservations for this page now. This will ensure that |
| 2536 | * any allocations necessary to record that reservation occur outside | 2602 | * any allocations necessary to record that reservation occur outside |
| @@ -2587,8 +2653,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2587 | ptep = huge_pte_offset(mm, address); | 2653 | ptep = huge_pte_offset(mm, address); |
| 2588 | if (ptep) { | 2654 | if (ptep) { |
| 2589 | entry = huge_ptep_get(ptep); | 2655 | entry = huge_ptep_get(ptep); |
| 2590 | if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | 2656 | if (unlikely(is_hugetlb_entry_migration(entry))) { |
| 2591 | return VM_FAULT_HWPOISON; | 2657 | migration_entry_wait(mm, (pmd_t *)ptep, address); |
| 2658 | return 0; | ||
| 2659 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | ||
| 2660 | return VM_FAULT_HWPOISON_LARGE | | ||
| 2661 | VM_FAULT_SET_HINDEX(h - hstates); | ||
| 2592 | } | 2662 | } |
| 2593 | 2663 | ||
| 2594 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); | 2664 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); |
| @@ -2878,18 +2948,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | |||
| 2878 | hugetlb_acct_memory(h, -(chg - freed)); | 2948 | hugetlb_acct_memory(h, -(chg - freed)); |
| 2879 | } | 2949 | } |
| 2880 | 2950 | ||
| 2951 | #ifdef CONFIG_MEMORY_FAILURE | ||
| 2952 | |||
| 2953 | /* Should be called in hugetlb_lock */ | ||
| 2954 | static int is_hugepage_on_freelist(struct page *hpage) | ||
| 2955 | { | ||
| 2956 | struct page *page; | ||
| 2957 | struct page *tmp; | ||
| 2958 | struct hstate *h = page_hstate(hpage); | ||
| 2959 | int nid = page_to_nid(hpage); | ||
| 2960 | |||
| 2961 | list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) | ||
| 2962 | if (page == hpage) | ||
| 2963 | return 1; | ||
| 2964 | return 0; | ||
| 2965 | } | ||
| 2966 | |||
| 2881 | /* | 2967 | /* |
| 2882 | * This function is called from memory failure code. | 2968 | * This function is called from memory failure code. |
| 2883 | * Assume the caller holds page lock of the head page. | 2969 | * Assume the caller holds page lock of the head page. |
| 2884 | */ | 2970 | */ |
| 2885 | void __isolate_hwpoisoned_huge_page(struct page *hpage) | 2971 | int dequeue_hwpoisoned_huge_page(struct page *hpage) |
| 2886 | { | 2972 | { |
| 2887 | struct hstate *h = page_hstate(hpage); | 2973 | struct hstate *h = page_hstate(hpage); |
| 2888 | int nid = page_to_nid(hpage); | 2974 | int nid = page_to_nid(hpage); |
| 2975 | int ret = -EBUSY; | ||
| 2889 | 2976 | ||
| 2890 | spin_lock(&hugetlb_lock); | 2977 | spin_lock(&hugetlb_lock); |
| 2891 | list_del(&hpage->lru); | 2978 | if (is_hugepage_on_freelist(hpage)) { |
| 2892 | h->free_huge_pages--; | 2979 | list_del(&hpage->lru); |
| 2893 | h->free_huge_pages_node[nid]--; | 2980 | set_page_refcounted(hpage); |
| 2981 | h->free_huge_pages--; | ||
| 2982 | h->free_huge_pages_node[nid]--; | ||
| 2983 | ret = 0; | ||
| 2984 | } | ||
| 2894 | spin_unlock(&hugetlb_lock); | 2985 | spin_unlock(&hugetlb_lock); |
| 2986 | return ret; | ||
| 2895 | } | 2987 | } |
| 2988 | #endif | ||
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 757f6b0accfe..44a8cefeae6e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
| @@ -7,21 +7,26 @@ | |||
| 7 | * Free Software Foundation. | 7 | * Free Software Foundation. |
| 8 | * | 8 | * |
| 9 | * High level machine check handler. Handles pages reported by the | 9 | * High level machine check handler. Handles pages reported by the |
| 10 | * hardware as being corrupted usually due to a 2bit ECC memory or cache | 10 | * hardware as being corrupted usually due to a multi-bit ECC memory or cache |
| 11 | * failure. | 11 | * failure. |
| 12 | * | ||
| 13 | * In addition there is a "soft offline" entry point that allows stop using | ||
| 14 | * not-yet-corrupted-by-suspicious pages without killing anything. | ||
| 12 | * | 15 | * |
| 13 | * Handles page cache pages in various states. The tricky part | 16 | * Handles page cache pages in various states. The tricky part |
| 14 | * here is that we can access any page asynchronous to other VM | 17 | * here is that we can access any page asynchronously in respect to |
| 15 | * users, because memory failures could happen anytime and anywhere, | 18 | * other VM users, because memory failures could happen anytime and |
| 16 | * possibly violating some of their assumptions. This is why this code | 19 | * anywhere. This could violate some of their assumptions. This is why |
| 17 | * has to be extremely careful. Generally it tries to use normal locking | 20 | * this code has to be extremely careful. Generally it tries to use |
| 18 | * rules, as in get the standard locks, even if that means the | 21 | * normal locking rules, as in get the standard locks, even if that means |
| 19 | * error handling takes potentially a long time. | 22 | * the error handling takes potentially a long time. |
| 20 | * | 23 | * |
| 21 | * The operation to map back from RMAP chains to processes has to walk | 24 | * There are several operations here with exponential complexity because |
| 22 | * the complete process list and has non linear complexity with the number | 25 | * of unsuitable VM data structures. For example the operation to map back |
| 23 | * mappings. In short it can be quite slow. But since memory corruptions | 26 | * from RMAP chains to processes has to walk the complete process list and |
| 24 | * are rare we hope to get away with this. | 27 | * has non linear complexity with the number. But since memory corruptions |
| 28 | * are rare we hope to get away with this. This avoids impacting the core | ||
| 29 | * VM. | ||
| 25 | */ | 30 | */ |
| 26 | 31 | ||
| 27 | /* | 32 | /* |
| @@ -30,7 +35,6 @@ | |||
| 30 | * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages | 35 | * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages |
| 31 | * - pass bad pages to kdump next kernel | 36 | * - pass bad pages to kdump next kernel |
| 32 | */ | 37 | */ |
| 33 | #define DEBUG 1 /* remove me in 2.6.34 */ | ||
| 34 | #include <linux/kernel.h> | 38 | #include <linux/kernel.h> |
| 35 | #include <linux/mm.h> | 39 | #include <linux/mm.h> |
| 36 | #include <linux/page-flags.h> | 40 | #include <linux/page-flags.h> |
| @@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p) | |||
| 78 | return 0; | 82 | return 0; |
| 79 | 83 | ||
| 80 | /* | 84 | /* |
| 81 | * page_mapping() does not accept slab page | 85 | * page_mapping() does not accept slab pages. |
| 82 | */ | 86 | */ |
| 83 | if (PageSlab(p)) | 87 | if (PageSlab(p)) |
| 84 | return -EINVAL; | 88 | return -EINVAL; |
| @@ -268,7 +272,7 @@ struct to_kill { | |||
| 268 | struct list_head nd; | 272 | struct list_head nd; |
| 269 | struct task_struct *tsk; | 273 | struct task_struct *tsk; |
| 270 | unsigned long addr; | 274 | unsigned long addr; |
| 271 | unsigned addr_valid:1; | 275 | char addr_valid; |
| 272 | }; | 276 | }; |
| 273 | 277 | ||
| 274 | /* | 278 | /* |
| @@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, | |||
| 309 | * a SIGKILL because the error is not contained anymore. | 313 | * a SIGKILL because the error is not contained anymore. |
| 310 | */ | 314 | */ |
| 311 | if (tk->addr == -EFAULT) { | 315 | if (tk->addr == -EFAULT) { |
| 312 | pr_debug("MCE: Unable to find user space address %lx in %s\n", | 316 | pr_info("MCE: Unable to find user space address %lx in %s\n", |
| 313 | page_to_pfn(p), tsk->comm); | 317 | page_to_pfn(p), tsk->comm); |
| 314 | tk->addr_valid = 0; | 318 | tk->addr_valid = 0; |
| 315 | } | 319 | } |
| @@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
| 577 | pfn, err); | 581 | pfn, err); |
| 578 | } else if (page_has_private(p) && | 582 | } else if (page_has_private(p) && |
| 579 | !try_to_release_page(p, GFP_NOIO)) { | 583 | !try_to_release_page(p, GFP_NOIO)) { |
| 580 | pr_debug("MCE %#lx: failed to release buffers\n", pfn); | 584 | pr_info("MCE %#lx: failed to release buffers\n", pfn); |
| 581 | } else { | 585 | } else { |
| 582 | ret = RECOVERED; | 586 | ret = RECOVERED; |
| 583 | } | 587 | } |
| @@ -693,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) | |||
| 693 | * Issues: | 697 | * Issues: |
| 694 | * - Error on hugepage is contained in hugepage unit (not in raw page unit.) | 698 | * - Error on hugepage is contained in hugepage unit (not in raw page unit.) |
| 695 | * To narrow down kill region to one page, we need to break up pmd. | 699 | * To narrow down kill region to one page, we need to break up pmd. |
| 696 | * - To support soft-offlining for hugepage, we need to support hugepage | ||
| 697 | * migration. | ||
| 698 | */ | 700 | */ |
| 699 | static int me_huge_page(struct page *p, unsigned long pfn) | 701 | static int me_huge_page(struct page *p, unsigned long pfn) |
| 700 | { | 702 | { |
| 703 | int res = 0; | ||
| 701 | struct page *hpage = compound_head(p); | 704 | struct page *hpage = compound_head(p); |
| 702 | /* | 705 | /* |
| 703 | * We can safely recover from error on free or reserved (i.e. | 706 | * We can safely recover from error on free or reserved (i.e. |
| @@ -710,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
| 710 | * so there is no race between isolation and mapping/unmapping. | 713 | * so there is no race between isolation and mapping/unmapping. |
| 711 | */ | 714 | */ |
| 712 | if (!(page_mapping(hpage) || PageAnon(hpage))) { | 715 | if (!(page_mapping(hpage) || PageAnon(hpage))) { |
| 713 | __isolate_hwpoisoned_huge_page(hpage); | 716 | res = dequeue_hwpoisoned_huge_page(hpage); |
| 714 | return RECOVERED; | 717 | if (!res) |
| 718 | return RECOVERED; | ||
| 715 | } | 719 | } |
| 716 | return DELAYED; | 720 | return DELAYED; |
| 717 | } | 721 | } |
| @@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p, | |||
| 836 | return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; | 840 | return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; |
| 837 | } | 841 | } |
| 838 | 842 | ||
| 839 | #define N_UNMAP_TRIES 5 | ||
| 840 | |||
| 841 | /* | 843 | /* |
| 842 | * Do all that is necessary to remove user space mappings. Unmap | 844 | * Do all that is necessary to remove user space mappings. Unmap |
| 843 | * the pages and send SIGBUS to the processes if the data was dirty. | 845 | * the pages and send SIGBUS to the processes if the data was dirty. |
| @@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
| 849 | struct address_space *mapping; | 851 | struct address_space *mapping; |
| 850 | LIST_HEAD(tokill); | 852 | LIST_HEAD(tokill); |
| 851 | int ret; | 853 | int ret; |
| 852 | int i; | ||
| 853 | int kill = 1; | 854 | int kill = 1; |
| 854 | struct page *hpage = compound_head(p); | 855 | struct page *hpage = compound_head(p); |
| 855 | 856 | ||
| @@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
| 903 | if (kill) | 904 | if (kill) |
| 904 | collect_procs(hpage, &tokill); | 905 | collect_procs(hpage, &tokill); |
| 905 | 906 | ||
| 906 | /* | 907 | ret = try_to_unmap(hpage, ttu); |
| 907 | * try_to_unmap can fail temporarily due to races. | ||
| 908 | * Try a few times (RED-PEN better strategy?) | ||
| 909 | */ | ||
| 910 | for (i = 0; i < N_UNMAP_TRIES; i++) { | ||
| 911 | ret = try_to_unmap(hpage, ttu); | ||
| 912 | if (ret == SWAP_SUCCESS) | ||
| 913 | break; | ||
| 914 | pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); | ||
| 915 | } | ||
| 916 | |||
| 917 | if (ret != SWAP_SUCCESS) | 908 | if (ret != SWAP_SUCCESS) |
| 918 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", | 909 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", |
| 919 | pfn, page_mapcount(hpage)); | 910 | pfn, page_mapcount(hpage)); |
| @@ -981,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 981 | * We need/can do nothing about count=0 pages. | 972 | * We need/can do nothing about count=0 pages. |
| 982 | * 1) it's a free page, and therefore in safe hand: | 973 | * 1) it's a free page, and therefore in safe hand: |
| 983 | * prep_new_page() will be the gate keeper. | 974 | * prep_new_page() will be the gate keeper. |
| 984 | * 2) it's part of a non-compound high order page. | 975 | * 2) it's a free hugepage, which is also safe: |
| 976 | * an affected hugepage will be dequeued from hugepage freelist, | ||
| 977 | * so there's no concern about reusing it ever after. | ||
| 978 | * 3) it's part of a non-compound high order page. | ||
| 985 | * Implies some kernel user: cannot stop them from | 979 | * Implies some kernel user: cannot stop them from |
| 986 | * R/W the page; let's pray that the page has been | 980 | * R/W the page; let's pray that the page has been |
| 987 | * used and will be freed some time later. | 981 | * used and will be freed some time later. |
| @@ -993,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 993 | if (is_free_buddy_page(p)) { | 987 | if (is_free_buddy_page(p)) { |
| 994 | action_result(pfn, "free buddy", DELAYED); | 988 | action_result(pfn, "free buddy", DELAYED); |
| 995 | return 0; | 989 | return 0; |
| 990 | } else if (PageHuge(hpage)) { | ||
| 991 | /* | ||
| 992 | * Check "just unpoisoned", "filter hit", and | ||
| 993 | * "race with other subpage." | ||
| 994 | */ | ||
| 995 | lock_page_nosync(hpage); | ||
| 996 | if (!PageHWPoison(hpage) | ||
| 997 | || (hwpoison_filter(p) && TestClearPageHWPoison(p)) | ||
| 998 | || (p != hpage && TestSetPageHWPoison(hpage))) { | ||
| 999 | atomic_long_sub(nr_pages, &mce_bad_pages); | ||
| 1000 | return 0; | ||
| 1001 | } | ||
| 1002 | set_page_hwpoison_huge_page(hpage); | ||
| 1003 | res = dequeue_hwpoisoned_huge_page(hpage); | ||
| 1004 | action_result(pfn, "free huge", | ||
| 1005 | res ? IGNORED : DELAYED); | ||
| 1006 | unlock_page(hpage); | ||
| 1007 | return res; | ||
| 996 | } else { | 1008 | } else { |
| 997 | action_result(pfn, "high order kernel", IGNORED); | 1009 | action_result(pfn, "high order kernel", IGNORED); |
| 998 | return -EBUSY; | 1010 | return -EBUSY; |
| @@ -1147,16 +1159,26 @@ int unpoison_memory(unsigned long pfn) | |||
| 1147 | page = compound_head(p); | 1159 | page = compound_head(p); |
| 1148 | 1160 | ||
| 1149 | if (!PageHWPoison(p)) { | 1161 | if (!PageHWPoison(p)) { |
| 1150 | pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); | 1162 | pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); |
| 1151 | return 0; | 1163 | return 0; |
| 1152 | } | 1164 | } |
| 1153 | 1165 | ||
| 1154 | nr_pages = 1 << compound_order(page); | 1166 | nr_pages = 1 << compound_order(page); |
| 1155 | 1167 | ||
| 1156 | if (!get_page_unless_zero(page)) { | 1168 | if (!get_page_unless_zero(page)) { |
| 1169 | /* | ||
| 1170 | * Since HWPoisoned hugepage should have non-zero refcount, | ||
| 1171 | * race between memory failure and unpoison seems to happen. | ||
| 1172 | * In such case unpoison fails and memory failure runs | ||
| 1173 | * to the end. | ||
| 1174 | */ | ||
| 1175 | if (PageHuge(page)) { | ||
| 1176 | pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); | ||
| 1177 | return 0; | ||
| 1178 | } | ||
| 1157 | if (TestClearPageHWPoison(p)) | 1179 | if (TestClearPageHWPoison(p)) |
| 1158 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1180 | atomic_long_sub(nr_pages, &mce_bad_pages); |
| 1159 | pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); | 1181 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); |
| 1160 | return 0; | 1182 | return 0; |
| 1161 | } | 1183 | } |
| 1162 | 1184 | ||
| @@ -1168,12 +1190,12 @@ int unpoison_memory(unsigned long pfn) | |||
| 1168 | * the free buddy page pool. | 1190 | * the free buddy page pool. |
| 1169 | */ | 1191 | */ |
| 1170 | if (TestClearPageHWPoison(page)) { | 1192 | if (TestClearPageHWPoison(page)) { |
| 1171 | pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); | 1193 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); |
| 1172 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1194 | atomic_long_sub(nr_pages, &mce_bad_pages); |
| 1173 | freeit = 1; | 1195 | freeit = 1; |
| 1196 | if (PageHuge(page)) | ||
| 1197 | clear_page_hwpoison_huge_page(page); | ||
| 1174 | } | 1198 | } |
| 1175 | if (PageHuge(p)) | ||
| 1176 | clear_page_hwpoison_huge_page(page); | ||
| 1177 | unlock_page(page); | 1199 | unlock_page(page); |
| 1178 | 1200 | ||
| 1179 | put_page(page); | 1201 | put_page(page); |
| @@ -1187,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory); | |||
| 1187 | static struct page *new_page(struct page *p, unsigned long private, int **x) | 1209 | static struct page *new_page(struct page *p, unsigned long private, int **x) |
| 1188 | { | 1210 | { |
| 1189 | int nid = page_to_nid(p); | 1211 | int nid = page_to_nid(p); |
| 1190 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | 1212 | if (PageHuge(p)) |
| 1213 | return alloc_huge_page_node(page_hstate(compound_head(p)), | ||
| 1214 | nid); | ||
| 1215 | else | ||
| 1216 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | ||
| 1191 | } | 1217 | } |
| 1192 | 1218 | ||
| 1193 | /* | 1219 | /* |
| @@ -1215,14 +1241,21 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
| 1215 | * was free. | 1241 | * was free. |
| 1216 | */ | 1242 | */ |
| 1217 | set_migratetype_isolate(p); | 1243 | set_migratetype_isolate(p); |
| 1244 | /* | ||
| 1245 | * When the target page is a free hugepage, just remove it | ||
| 1246 | * from free hugepage list. | ||
| 1247 | */ | ||
| 1218 | if (!get_page_unless_zero(compound_head(p))) { | 1248 | if (!get_page_unless_zero(compound_head(p))) { |
| 1219 | if (is_free_buddy_page(p)) { | 1249 | if (PageHuge(p)) { |
| 1220 | pr_debug("get_any_page: %#lx free buddy page\n", pfn); | 1250 | pr_info("get_any_page: %#lx free huge page\n", pfn); |
| 1251 | ret = dequeue_hwpoisoned_huge_page(compound_head(p)); | ||
| 1252 | } else if (is_free_buddy_page(p)) { | ||
| 1253 | pr_info("get_any_page: %#lx free buddy page\n", pfn); | ||
| 1221 | /* Set hwpoison bit while page is still isolated */ | 1254 | /* Set hwpoison bit while page is still isolated */ |
| 1222 | SetPageHWPoison(p); | 1255 | SetPageHWPoison(p); |
| 1223 | ret = 0; | 1256 | ret = 0; |
| 1224 | } else { | 1257 | } else { |
| 1225 | pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", | 1258 | pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", |
| 1226 | pfn, p->flags); | 1259 | pfn, p->flags); |
| 1227 | ret = -EIO; | 1260 | ret = -EIO; |
| 1228 | } | 1261 | } |
| @@ -1235,6 +1268,45 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
| 1235 | return ret; | 1268 | return ret; |
| 1236 | } | 1269 | } |
| 1237 | 1270 | ||
| 1271 | static int soft_offline_huge_page(struct page *page, int flags) | ||
| 1272 | { | ||
| 1273 | int ret; | ||
| 1274 | unsigned long pfn = page_to_pfn(page); | ||
| 1275 | struct page *hpage = compound_head(page); | ||
| 1276 | LIST_HEAD(pagelist); | ||
| 1277 | |||
| 1278 | ret = get_any_page(page, pfn, flags); | ||
| 1279 | if (ret < 0) | ||
| 1280 | return ret; | ||
| 1281 | if (ret == 0) | ||
| 1282 | goto done; | ||
| 1283 | |||
| 1284 | if (PageHWPoison(hpage)) { | ||
| 1285 | put_page(hpage); | ||
| 1286 | pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); | ||
| 1287 | return -EBUSY; | ||
| 1288 | } | ||
| 1289 | |||
| 1290 | /* Keep page count to indicate a given hugepage is isolated. */ | ||
| 1291 | |||
| 1292 | list_add(&hpage->lru, &pagelist); | ||
| 1293 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | ||
| 1294 | if (ret) { | ||
| 1295 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | ||
| 1296 | pfn, ret, page->flags); | ||
| 1297 | if (ret > 0) | ||
| 1298 | ret = -EIO; | ||
| 1299 | return ret; | ||
| 1300 | } | ||
| 1301 | done: | ||
| 1302 | if (!PageHWPoison(hpage)) | ||
| 1303 | atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); | ||
| 1304 | set_page_hwpoison_huge_page(hpage); | ||
| 1305 | dequeue_hwpoisoned_huge_page(hpage); | ||
| 1306 | /* keep elevated page count for bad page */ | ||
| 1307 | return ret; | ||
| 1308 | } | ||
| 1309 | |||
| 1238 | /** | 1310 | /** |
| 1239 | * soft_offline_page - Soft offline a page. | 1311 | * soft_offline_page - Soft offline a page. |
| 1240 | * @page: page to offline | 1312 | * @page: page to offline |
| @@ -1262,6 +1334,9 @@ int soft_offline_page(struct page *page, int flags) | |||
| 1262 | int ret; | 1334 | int ret; |
| 1263 | unsigned long pfn = page_to_pfn(page); | 1335 | unsigned long pfn = page_to_pfn(page); |
| 1264 | 1336 | ||
| 1337 | if (PageHuge(page)) | ||
| 1338 | return soft_offline_huge_page(page, flags); | ||
| 1339 | |||
| 1265 | ret = get_any_page(page, pfn, flags); | 1340 | ret = get_any_page(page, pfn, flags); |
| 1266 | if (ret < 0) | 1341 | if (ret < 0) |
| 1267 | return ret; | 1342 | return ret; |
| @@ -1288,7 +1363,7 @@ int soft_offline_page(struct page *page, int flags) | |||
| 1288 | goto done; | 1363 | goto done; |
| 1289 | } | 1364 | } |
| 1290 | if (!PageLRU(page)) { | 1365 | if (!PageLRU(page)) { |
| 1291 | pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", | 1366 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", |
| 1292 | pfn, page->flags); | 1367 | pfn, page->flags); |
| 1293 | return -EIO; | 1368 | return -EIO; |
| 1294 | } | 1369 | } |
| @@ -1302,7 +1377,7 @@ int soft_offline_page(struct page *page, int flags) | |||
| 1302 | if (PageHWPoison(page)) { | 1377 | if (PageHWPoison(page)) { |
| 1303 | unlock_page(page); | 1378 | unlock_page(page); |
| 1304 | put_page(page); | 1379 | put_page(page); |
| 1305 | pr_debug("soft offline: %#lx page already poisoned\n", pfn); | 1380 | pr_info("soft offline: %#lx page already poisoned\n", pfn); |
| 1306 | return -EBUSY; | 1381 | return -EBUSY; |
| 1307 | } | 1382 | } |
| 1308 | 1383 | ||
| @@ -1323,7 +1398,7 @@ int soft_offline_page(struct page *page, int flags) | |||
| 1323 | put_page(page); | 1398 | put_page(page); |
| 1324 | if (ret == 1) { | 1399 | if (ret == 1) { |
| 1325 | ret = 0; | 1400 | ret = 0; |
| 1326 | pr_debug("soft_offline: %#lx: invalidated\n", pfn); | 1401 | pr_info("soft_offline: %#lx: invalidated\n", pfn); |
| 1327 | goto done; | 1402 | goto done; |
| 1328 | } | 1403 | } |
| 1329 | 1404 | ||
| @@ -1339,13 +1414,13 @@ int soft_offline_page(struct page *page, int flags) | |||
| 1339 | list_add(&page->lru, &pagelist); | 1414 | list_add(&page->lru, &pagelist); |
| 1340 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | 1415 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); |
| 1341 | if (ret) { | 1416 | if (ret) { |
| 1342 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | 1417 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
| 1343 | pfn, ret, page->flags); | 1418 | pfn, ret, page->flags); |
| 1344 | if (ret > 0) | 1419 | if (ret > 0) |
| 1345 | ret = -EIO; | 1420 | ret = -EIO; |
| 1346 | } | 1421 | } |
| 1347 | } else { | 1422 | } else { |
| 1348 | pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", | 1423 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", |
| 1349 | pfn, ret, page_count(page), page->flags); | 1424 | pfn, ret, page_count(page), page->flags); |
| 1350 | } | 1425 | } |
| 1351 | if (ret) | 1426 | if (ret) |
diff --git a/mm/memory.c b/mm/memory.c index 98b58fecedef..af82741caaa4 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1450 | if (ret & VM_FAULT_OOM) | 1450 | if (ret & VM_FAULT_OOM) |
| 1451 | return i ? i : -ENOMEM; | 1451 | return i ? i : -ENOMEM; |
| 1452 | if (ret & | 1452 | if (ret & |
| 1453 | (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) | 1453 | (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| |
| 1454 | VM_FAULT_SIGBUS)) | ||
| 1454 | return i ? i : -EFAULT; | 1455 | return i ? i : -EFAULT; |
| 1455 | BUG(); | 1456 | BUG(); |
| 1456 | } | 1457 | } |
diff --git a/mm/migrate.c b/mm/migrate.c index 38e7cad782f4..f8c9bccf2520 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -32,6 +32,7 @@ | |||
| 32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
| 33 | #include <linux/memcontrol.h> | 33 | #include <linux/memcontrol.h> |
| 34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
| 35 | #include <linux/hugetlb.h> | ||
| 35 | #include <linux/gfp.h> | 36 | #include <linux/gfp.h> |
| 36 | 37 | ||
| 37 | #include "internal.h" | 38 | #include "internal.h" |
| @@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
| 95 | pte_t *ptep, pte; | 96 | pte_t *ptep, pte; |
| 96 | spinlock_t *ptl; | 97 | spinlock_t *ptl; |
| 97 | 98 | ||
| 98 | pgd = pgd_offset(mm, addr); | 99 | if (unlikely(PageHuge(new))) { |
| 99 | if (!pgd_present(*pgd)) | 100 | ptep = huge_pte_offset(mm, addr); |
| 100 | goto out; | 101 | if (!ptep) |
| 102 | goto out; | ||
| 103 | ptl = &mm->page_table_lock; | ||
| 104 | } else { | ||
| 105 | pgd = pgd_offset(mm, addr); | ||
| 106 | if (!pgd_present(*pgd)) | ||
| 107 | goto out; | ||
| 101 | 108 | ||
| 102 | pud = pud_offset(pgd, addr); | 109 | pud = pud_offset(pgd, addr); |
| 103 | if (!pud_present(*pud)) | 110 | if (!pud_present(*pud)) |
| 104 | goto out; | 111 | goto out; |
| 105 | 112 | ||
| 106 | pmd = pmd_offset(pud, addr); | 113 | pmd = pmd_offset(pud, addr); |
| 107 | if (!pmd_present(*pmd)) | 114 | if (!pmd_present(*pmd)) |
| 108 | goto out; | 115 | goto out; |
| 109 | 116 | ||
| 110 | ptep = pte_offset_map(pmd, addr); | 117 | ptep = pte_offset_map(pmd, addr); |
| 111 | 118 | ||
| 112 | if (!is_swap_pte(*ptep)) { | 119 | if (!is_swap_pte(*ptep)) { |
| 113 | pte_unmap(ptep); | 120 | pte_unmap(ptep); |
| 114 | goto out; | 121 | goto out; |
| 115 | } | 122 | } |
| 123 | |||
| 124 | ptl = pte_lockptr(mm, pmd); | ||
| 125 | } | ||
| 116 | 126 | ||
| 117 | ptl = pte_lockptr(mm, pmd); | ||
| 118 | spin_lock(ptl); | 127 | spin_lock(ptl); |
| 119 | pte = *ptep; | 128 | pte = *ptep; |
| 120 | if (!is_swap_pte(pte)) | 129 | if (!is_swap_pte(pte)) |
| @@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
| 130 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 139 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
| 131 | if (is_write_migration_entry(entry)) | 140 | if (is_write_migration_entry(entry)) |
| 132 | pte = pte_mkwrite(pte); | 141 | pte = pte_mkwrite(pte); |
| 142 | #ifdef CONFIG_HUGETLB_PAGE | ||
| 143 | if (PageHuge(new)) | ||
| 144 | pte = pte_mkhuge(pte); | ||
| 145 | #endif | ||
| 133 | flush_cache_page(vma, addr, pte_pfn(pte)); | 146 | flush_cache_page(vma, addr, pte_pfn(pte)); |
| 134 | set_pte_at(mm, addr, ptep, pte); | 147 | set_pte_at(mm, addr, ptep, pte); |
| 135 | 148 | ||
| 136 | if (PageAnon(new)) | 149 | if (PageHuge(new)) { |
| 150 | if (PageAnon(new)) | ||
| 151 | hugepage_add_anon_rmap(new, vma, addr); | ||
| 152 | else | ||
| 153 | page_dup_rmap(new); | ||
| 154 | } else if (PageAnon(new)) | ||
| 137 | page_add_anon_rmap(new, vma, addr); | 155 | page_add_anon_rmap(new, vma, addr); |
| 138 | else | 156 | else |
| 139 | page_add_file_rmap(new); | 157 | page_add_file_rmap(new); |
| @@ -276,11 +294,59 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
| 276 | } | 294 | } |
| 277 | 295 | ||
| 278 | /* | 296 | /* |
| 297 | * The expected number of remaining references is the same as that | ||
| 298 | * of migrate_page_move_mapping(). | ||
| 299 | */ | ||
| 300 | int migrate_huge_page_move_mapping(struct address_space *mapping, | ||
| 301 | struct page *newpage, struct page *page) | ||
| 302 | { | ||
| 303 | int expected_count; | ||
| 304 | void **pslot; | ||
| 305 | |||
| 306 | if (!mapping) { | ||
| 307 | if (page_count(page) != 1) | ||
| 308 | return -EAGAIN; | ||
| 309 | return 0; | ||
| 310 | } | ||
| 311 | |||
| 312 | spin_lock_irq(&mapping->tree_lock); | ||
| 313 | |||
| 314 | pslot = radix_tree_lookup_slot(&mapping->page_tree, | ||
| 315 | page_index(page)); | ||
| 316 | |||
| 317 | expected_count = 2 + page_has_private(page); | ||
| 318 | if (page_count(page) != expected_count || | ||
| 319 | (struct page *)radix_tree_deref_slot(pslot) != page) { | ||
| 320 | spin_unlock_irq(&mapping->tree_lock); | ||
| 321 | return -EAGAIN; | ||
| 322 | } | ||
| 323 | |||
| 324 | if (!page_freeze_refs(page, expected_count)) { | ||
| 325 | spin_unlock_irq(&mapping->tree_lock); | ||
| 326 | return -EAGAIN; | ||
| 327 | } | ||
| 328 | |||
| 329 | get_page(newpage); | ||
| 330 | |||
| 331 | radix_tree_replace_slot(pslot, newpage); | ||
| 332 | |||
| 333 | page_unfreeze_refs(page, expected_count); | ||
| 334 | |||
| 335 | __put_page(page); | ||
| 336 | |||
| 337 | spin_unlock_irq(&mapping->tree_lock); | ||
| 338 | return 0; | ||
| 339 | } | ||
| 340 | |||
| 341 | /* | ||
| 279 | * Copy the page to its new location | 342 | * Copy the page to its new location |
| 280 | */ | 343 | */ |
| 281 | static void migrate_page_copy(struct page *newpage, struct page *page) | 344 | void migrate_page_copy(struct page *newpage, struct page *page) |
| 282 | { | 345 | { |
| 283 | copy_highpage(newpage, page); | 346 | if (PageHuge(page)) |
| 347 | copy_huge_page(newpage, page); | ||
| 348 | else | ||
| 349 | copy_highpage(newpage, page); | ||
| 284 | 350 | ||
| 285 | if (PageError(page)) | 351 | if (PageError(page)) |
| 286 | SetPageError(newpage); | 352 | SetPageError(newpage); |
| @@ -724,6 +790,92 @@ move_newpage: | |||
| 724 | } | 790 | } |
| 725 | 791 | ||
| 726 | /* | 792 | /* |
| 793 | * Counterpart of unmap_and_move_page() for hugepage migration. | ||
| 794 | * | ||
| 795 | * This function doesn't wait for the completion of hugepage I/O | ||
| 796 | * because there is no race between I/O and migration for hugepage. | ||
| 797 | * Note that currently hugepage I/O occurs only in direct I/O | ||
| 798 | * where no lock is held and PG_writeback is irrelevant, | ||
| 799 | * and writeback status of all subpages is counted in the reference | ||
| 800 | * count of the head page (i.e. if all subpages of a 2MB hugepage are | ||
| 801 | * under direct I/O, the reference of the head page is 512 and a bit more.) | ||
| 802 | * This means that when we try to migrate hugepage whose subpages are | ||
| 803 | * doing direct I/O, some references remain after try_to_unmap() and | ||
| 804 | * hugepage migration fails without data corruption. | ||
| 805 | * | ||
| 806 | * There is also no race when direct I/O is issued on the page under migration, | ||
| 807 | * because then pte is replaced with migration swap entry and direct I/O code | ||
| 808 | * will wait in the page fault for migration to complete. | ||
| 809 | */ | ||
| 810 | static int unmap_and_move_huge_page(new_page_t get_new_page, | ||
| 811 | unsigned long private, struct page *hpage, | ||
| 812 | int force, int offlining) /* NOTE(review): offlining is not used in this body */ | ||
| 813 | { /* Returns -ENOMEM if get_new_page() gives no page; otherwise rc, which starts as -EAGAIN and is replaced by move_to_new_page()'s result once hpage is fully unmapped. */ | ||
| 814 | int rc = 0; | ||
| 815 | int *result = NULL; /* may be set by get_new_page(); if non-NULL it receives rc or the new page's node id */ | ||
| 816 | struct page *new_hpage = get_new_page(hpage, private, &result); | ||
| 817 | int rcu_locked = 0; | ||
| 818 | struct anon_vma *anon_vma = NULL; | ||
| 819 | |||
| 820 | if (!new_hpage) | ||
| 821 | return -ENOMEM; | ||
| 822 | |||
| 823 | rc = -EAGAIN; | ||
| 824 | |||
| 825 | if (!trylock_page(hpage)) { | ||
| 826 | if (!force) | ||
| 827 | goto out; /* page lock NOT held here: "out" must stay below unlock_page() */ | ||
| 828 | lock_page(hpage); | ||
| 829 | } | ||
| 830 | |||
| 831 | if (PageAnon(hpage)) { | ||
| 832 | rcu_read_lock(); | ||
| 833 | rcu_locked = 1; | ||
| 834 | |||
| 835 | if (page_mapped(hpage)) { | ||
| 836 | anon_vma = page_anon_vma(hpage); | ||
| 837 | atomic_inc(&anon_vma->external_refcount); /* dropped again by atomic_dec_and_lock() below */ | ||
| 838 | } | ||
| 839 | } | ||
| 840 | |||
| 841 | try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | ||
| 842 | |||
| 843 | if (!page_mapped(hpage)) /* only move once no mapping of hpage remains */ | ||
| 844 | rc = move_to_new_page(new_hpage, hpage, 1); | ||
| 845 | |||
| 846 | if (rc) | ||
| 847 | remove_migration_ptes(hpage, hpage); | ||
| 848 | |||
| 849 | if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, | ||
| 850 | &anon_vma->lock)) { | ||
| 851 | int empty = list_empty(&anon_vma->head); /* sampled under anon_vma->lock, acted on after unlocking */ | ||
| 852 | spin_unlock(&anon_vma->lock); | ||
| 853 | if (empty) | ||
| 854 | anon_vma_free(anon_vma); | ||
| 855 | } | ||
| 856 | |||
| 857 | if (rcu_locked) | ||
| 858 | rcu_read_unlock(); | ||
| 859 | unlock_page(hpage); /* reached only with the page lock held; the failed-trylock path jumps past this */ | ||
| 860 | out: | ||
| 861 | |||
| 862 | if (rc != -EAGAIN) { /* on -EAGAIN hpage stays on its list and keeps its reference */ | ||
| 863 | list_del(&hpage->lru); | ||
| 864 | put_page(hpage); | ||
| 865 | } | ||
| 866 | |||
| 867 | put_page(new_hpage); | ||
| 868 | |||
| 869 | if (result) { | ||
| 870 | if (rc) | ||
| 871 | *result = rc; | ||
| 872 | else | ||
| 873 | *result = page_to_nid(new_hpage); | ||
| 874 | } | ||
| 875 | return rc; | ||
| 876 | } | ||
| 877 | |||
| 878 | /* | ||
| 727 | * migrate_pages | 879 | * migrate_pages |
| 728 | * | 880 | * |
| 729 | * The function takes one list of pages to migrate and a function | 881 | * The function takes one list of pages to migrate and a function |
| @@ -788,6 +940,52 @@ out: | |||
| 788 | return nr_failed + retry; | 940 | return nr_failed + retry; |
| 789 | } | 941 | } |
| 790 | 942 | ||
| 943 | int migrate_huge_pages(struct list_head *from, /* pages to migrate, linked through page->lru */ | ||
| 944 | new_page_t get_new_page, unsigned long private, int offlining) /* all three are handed unchanged to unmap_and_move_huge_page() */ | ||
| 945 | { /* Try to migrate every page on @from, making up to 10 passes while some page still reports -EAGAIN. */ | ||
| 946 | int retry = 1; /* non-zero so the first pass runs; afterwards the number of -EAGAIN results in the latest pass */ | ||
| 947 | int nr_failed = 0; /* results other than 0, -EAGAIN and -ENOMEM, summed over all passes */ | ||
| 948 | int pass = 0; | ||
| 949 | struct page *page; | ||
| 950 | struct page *page2; /* lookahead cursor for the _safe iterator; entries may be unlinked during the walk */ | ||
| 951 | int rc; | ||
| 952 | |||
| 953 | for (pass = 0; pass < 10 && retry; pass++) { | ||
| 954 | retry = 0; | ||
| 955 | |||
| 956 | list_for_each_entry_safe(page, page2, from, lru) { | ||
| 957 | cond_resched(); | ||
| 958 | |||
| 959 | rc = unmap_and_move_huge_page(get_new_page, | ||
| 960 | private, page, pass > 2, offlining); /* "force" is set only from the fourth pass onwards */ | ||
| 961 | |||
| 962 | switch(rc) { | ||
| 963 | case -ENOMEM: /* abandon the whole run; rc is returned after the cleanup at out */ | ||
| 964 | goto out; | ||
| 965 | case -EAGAIN: | ||
| 966 | retry++; | ||
| 967 | break; | ||
| 968 | case 0: | ||
| 969 | break; | ||
| 970 | default: | ||
| 971 | /* Permanent failure */ | ||
| 972 | nr_failed++; | ||
| 973 | break; | ||
| 974 | } | ||
| 975 | } | ||
| 976 | } | ||
| 977 | rc = 0; /* normal loop exit: only the goto above reaches out with rc != 0 */ | ||
| 978 | out: | ||
| 979 | |||
| 980 | list_for_each_entry_safe(page, page2, from, lru) /* drop one reference on each page still on the list; entries are not unlinked here */ | ||
| 981 | put_page(page); | ||
| 982 | |||
| 983 | if (rc) | ||
| 984 | return rc; | ||
| 985 | |||
| 986 | return nr_failed + retry; /* 0 only if nothing failed and nothing is left to retry */ | ||
| 987 | } | ||
| 988 | |||
| 791 | #ifdef CONFIG_NUMA | 989 | #ifdef CONFIG_NUMA |
| 792 | /* | 990 | /* |
| 793 | * Move a list of individual pages | 991 | * Move a list of individual pages |
| @@ -780,10 +780,10 @@ void page_move_anon_rmap(struct page *page, | |||
| 780 | } | 780 | } |
| 781 | 781 | ||
| 782 | /** | 782 | /** |
| 783 | * __page_set_anon_rmap - setup new anonymous rmap | 783 | * __page_set_anon_rmap - set up new anonymous rmap |
| 784 | * @page: the page to add the mapping to | 784 | * @page: Page to add to rmap |
| 785 | * @vma: the vm area in which the mapping is added | 785 | * @vma: VM area to add page to. |
| 786 | * @address: the user virtual address mapped | 786 | * @address: User virtual address of the mapping |
| 787 | * @exclusive: the page is exclusively owned by the current process | 787 | * @exclusive: the page is exclusively owned by the current process |
| 788 | */ | 788 | */ |
| 789 | static void __page_set_anon_rmap(struct page *page, | 789 | static void __page_set_anon_rmap(struct page *page, |
| @@ -793,25 +793,16 @@ static void __page_set_anon_rmap(struct page *page, | |||
| 793 | 793 | ||
| 794 | BUG_ON(!anon_vma); | 794 | BUG_ON(!anon_vma); |
| 795 | 795 | ||
| 796 | if (PageAnon(page)) | ||
| 797 | return; | ||
| 798 | |||
| 796 | /* | 799 | /* |
| 797 | * If the page isn't exclusively mapped into this vma, | 800 | * If the page isn't exclusively mapped into this vma, |
| 798 | * we must use the _oldest_ possible anon_vma for the | 801 | * we must use the _oldest_ possible anon_vma for the |
| 799 | * page mapping! | 802 | * page mapping! |
| 800 | */ | 803 | */ |
| 801 | if (!exclusive) { | 804 | if (!exclusive) |
| 802 | if (PageAnon(page)) | ||
| 803 | return; | ||
| 804 | anon_vma = anon_vma->root; | 805 | anon_vma = anon_vma->root; |
| 805 | } else { | ||
| 806 | /* | ||
| 807 | * In this case, swapped-out-but-not-discarded swap-cache | ||
| 808 | * is remapped. So, no need to update page->mapping here. | ||
| 809 | * We convice anon_vma poitned by page->mapping is not obsolete | ||
| 810 | * because vma->anon_vma is necessary to be a family of it. | ||
| 811 | */ | ||
| 812 | if (PageAnon(page)) | ||
| 813 | return; | ||
| 814 | } | ||
| 815 | 806 | ||
| 816 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 807 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
| 817 | page->mapping = (struct address_space *) anon_vma; | 808 | page->mapping = (struct address_space *) anon_vma; |
