aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/ia64/include/asm/siginfo.h1
-rw-r--r--arch/x86/mm/fault.c19
-rw-r--r--fs/hugetlbfs/inode.c15
-rw-r--r--fs/signalfd.c10
-rw-r--r--include/linux/hugetlb.h17
-rw-r--r--include/linux/migrate.h16
-rw-r--r--include/linux/mm.h12
-rw-r--r--include/linux/signalfd.h3
-rw-r--r--mm/hugetlb.c233
-rw-r--r--mm/memory-failure.c175
-rw-r--r--mm/memory.c3
-rw-r--r--mm/migrate.c234
-rw-r--r--mm/rmap.c25
13 files changed, 596 insertions, 167 deletions
diff --git a/arch/ia64/include/asm/siginfo.h b/arch/ia64/include/asm/siginfo.h
index 118d42979003..c8fcaa2ac48f 100644
--- a/arch/ia64/include/asm/siginfo.h
+++ b/arch/ia64/include/asm/siginfo.h
@@ -62,6 +62,7 @@ typedef struct siginfo {
62 int _imm; /* immediate value for "break" */ 62 int _imm; /* immediate value for "break" */
63 unsigned int _flags; /* see below */ 63 unsigned int _flags; /* see below */
64 unsigned long _isr; /* isr */ 64 unsigned long _isr; /* isr */
65 short _addr_lsb; /* lsb of faulting address */
65 } _sigfault; 66 } _sigfault;
66 67
67 /* SIGPOLL */ 68 /* SIGPOLL */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 79b0b372d2d0..852b319edbdc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -11,6 +11,7 @@
11#include <linux/kprobes.h> /* __kprobes, ... */ 11#include <linux/kprobes.h> /* __kprobes, ... */
12#include <linux/mmiotrace.h> /* kmmio_handler, ... */ 12#include <linux/mmiotrace.h> /* kmmio_handler, ... */
13#include <linux/perf_event.h> /* perf_sw_event */ 13#include <linux/perf_event.h> /* perf_sw_event */
14#include <linux/hugetlb.h> /* hstate_index_to_shift */
14 15
15#include <asm/traps.h> /* dotraplinkage, ... */ 16#include <asm/traps.h> /* dotraplinkage, ... */
16#include <asm/pgalloc.h> /* pgd_*(), ... */ 17#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
160 161
161static void 162static void
162force_sig_info_fault(int si_signo, int si_code, unsigned long address, 163force_sig_info_fault(int si_signo, int si_code, unsigned long address,
163 struct task_struct *tsk) 164 struct task_struct *tsk, int fault)
164{ 165{
166 unsigned lsb = 0;
165 siginfo_t info; 167 siginfo_t info;
166 168
167 info.si_signo = si_signo; 169 info.si_signo = si_signo;
168 info.si_errno = 0; 170 info.si_errno = 0;
169 info.si_code = si_code; 171 info.si_code = si_code;
170 info.si_addr = (void __user *)address; 172 info.si_addr = (void __user *)address;
171 info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; 173 if (fault & VM_FAULT_HWPOISON_LARGE)
174 lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
175 if (fault & VM_FAULT_HWPOISON)
176 lsb = PAGE_SHIFT;
177 info.si_addr_lsb = lsb;
172 178
173 force_sig_info(si_signo, &info, tsk); 179 force_sig_info(si_signo, &info, tsk);
174} 180}
@@ -722,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
722 tsk->thread.error_code = error_code | (address >= TASK_SIZE); 728 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
723 tsk->thread.trap_no = 14; 729 tsk->thread.trap_no = 14;
724 730
725 force_sig_info_fault(SIGSEGV, si_code, address, tsk); 731 force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
726 732
727 return; 733 return;
728 } 734 }
@@ -807,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
807 tsk->thread.trap_no = 14; 813 tsk->thread.trap_no = 14;
808 814
809#ifdef CONFIG_MEMORY_FAILURE 815#ifdef CONFIG_MEMORY_FAILURE
810 if (fault & VM_FAULT_HWPOISON) { 816 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
811 printk(KERN_ERR 817 printk(KERN_ERR
812 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", 818 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
813 tsk->comm, tsk->pid, address); 819 tsk->comm, tsk->pid, address);
814 code = BUS_MCEERR_AR; 820 code = BUS_MCEERR_AR;
815 } 821 }
816#endif 822#endif
817 force_sig_info_fault(SIGBUS, code, address, tsk); 823 force_sig_info_fault(SIGBUS, code, address, tsk, fault);
818} 824}
819 825
820static noinline void 826static noinline void
@@ -824,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
824 if (fault & VM_FAULT_OOM) { 830 if (fault & VM_FAULT_OOM) {
825 out_of_memory(regs, error_code, address); 831 out_of_memory(regs, error_code, address);
826 } else { 832 } else {
827 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) 833 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
834 VM_FAULT_HWPOISON_LARGE))
828 do_sigbus(regs, error_code, address, fault); 835 do_sigbus(regs, error_code, address, fault);
829 else 836 else
830 BUG(); 837 BUG();
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 113eba3d3c38..a14328d270e8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,6 +31,7 @@
31#include <linux/statfs.h> 31#include <linux/statfs.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/magic.h> 33#include <linux/magic.h>
34#include <linux/migrate.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36 37
@@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
573 return 0; 574 return 0;
574} 575}
575 576
577static int hugetlbfs_migrate_page(struct address_space *mapping,
578 struct page *newpage, struct page *page)
579{
580 int rc;
581
582 rc = migrate_huge_page_move_mapping(mapping, newpage, page);
583 if (rc)
584 return rc;
585 migrate_page_copy(newpage, page);
586
587 return 0;
588}
589
576static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 590static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
577{ 591{
578 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); 592 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -659,6 +673,7 @@ static const struct address_space_operations hugetlbfs_aops = {
659 .write_begin = hugetlbfs_write_begin, 673 .write_begin = hugetlbfs_write_begin,
660 .write_end = hugetlbfs_write_end, 674 .write_end = hugetlbfs_write_end,
661 .set_page_dirty = hugetlbfs_set_page_dirty, 675 .set_page_dirty = hugetlbfs_set_page_dirty,
676 .migratepage = hugetlbfs_migrate_page,
662}; 677};
663 678
664 679
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 74047304b01a..492465b451dd 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -99,6 +99,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
99#ifdef __ARCH_SI_TRAPNO 99#ifdef __ARCH_SI_TRAPNO
100 err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); 100 err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
101#endif 101#endif
102#ifdef BUS_MCEERR_AO
103 /*
104 * Other callers might not initialize the si_addr_lsb field,
105 * so check explicitly for the right codes here.
106 */
107 if (kinfo->si_code == BUS_MCEERR_AR ||
108 kinfo->si_code == BUS_MCEERR_AO)
109 err |= __put_user((short) kinfo->si_addr_lsb,
110 &uinfo->ssi_addr_lsb);
111#endif
102 break; 112 break;
103 case __SI_CHLD: 113 case __SI_CHLD:
104 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); 114 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f479700df61b..943c76b3d4bb 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -43,7 +43,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
43 struct vm_area_struct *vma, 43 struct vm_area_struct *vma,
44 int acctflags); 44 int acctflags);
45void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); 45void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
46void __isolate_hwpoisoned_huge_page(struct page *page); 46int dequeue_hwpoisoned_huge_page(struct page *page);
47void copy_huge_page(struct page *dst, struct page *src);
47 48
48extern unsigned long hugepages_treat_as_movable; 49extern unsigned long hugepages_treat_as_movable;
49extern const unsigned long hugetlb_zero, hugetlb_infinity; 50extern const unsigned long hugetlb_zero, hugetlb_infinity;
@@ -101,7 +102,10 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
101#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) 102#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
102#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) 103#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
103#define huge_pte_offset(mm, address) 0 104#define huge_pte_offset(mm, address) 0
104#define __isolate_hwpoisoned_huge_page(page) 0 105#define dequeue_hwpoisoned_huge_page(page) 0
106static inline void copy_huge_page(struct page *dst, struct page *src)
107{
108}
105 109
106#define hugetlb_change_protection(vma, address, end, newprot) 110#define hugetlb_change_protection(vma, address, end, newprot)
107 111
@@ -228,6 +232,8 @@ struct huge_bootmem_page {
228 struct hstate *hstate; 232 struct hstate *hstate;
229}; 233};
230 234
235struct page *alloc_huge_page_node(struct hstate *h, int nid);
236
231/* arch callback */ 237/* arch callback */
232int __init alloc_bootmem_huge_page(struct hstate *h); 238int __init alloc_bootmem_huge_page(struct hstate *h);
233 239
@@ -301,8 +307,14 @@ static inline struct hstate *page_hstate(struct page *page)
301 return size_to_hstate(PAGE_SIZE << compound_order(page)); 307 return size_to_hstate(PAGE_SIZE << compound_order(page));
302} 308}
303 309
310static inline unsigned hstate_index_to_shift(unsigned index)
311{
312 return hstates[index].order + PAGE_SHIFT;
313}
314
304#else 315#else
305struct hstate {}; 316struct hstate {};
317#define alloc_huge_page_node(h, nid) NULL
306#define alloc_bootmem_huge_page(h) NULL 318#define alloc_bootmem_huge_page(h) NULL
307#define hstate_file(f) NULL 319#define hstate_file(f) NULL
308#define hstate_vma(v) NULL 320#define hstate_vma(v) NULL
@@ -317,6 +329,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h)
317{ 329{
318 return 1; 330 return 1;
319} 331}
332#define hstate_index_to_shift(index) 0
320#endif 333#endif
321 334
322#endif /* _LINUX_HUGETLB_H */ 335#endif /* _LINUX_HUGETLB_H */
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 7238231b8dd4..085527fb8261 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -14,6 +14,8 @@ extern int migrate_page(struct address_space *,
14 struct page *, struct page *); 14 struct page *, struct page *);
15extern int migrate_pages(struct list_head *l, new_page_t x, 15extern int migrate_pages(struct list_head *l, new_page_t x,
16 unsigned long private, int offlining); 16 unsigned long private, int offlining);
17extern int migrate_huge_pages(struct list_head *l, new_page_t x,
18 unsigned long private, int offlining);
17 19
18extern int fail_migrate_page(struct address_space *, 20extern int fail_migrate_page(struct address_space *,
19 struct page *, struct page *); 21 struct page *, struct page *);
@@ -23,12 +25,17 @@ extern int migrate_prep_local(void);
23extern int migrate_vmas(struct mm_struct *mm, 25extern int migrate_vmas(struct mm_struct *mm,
24 const nodemask_t *from, const nodemask_t *to, 26 const nodemask_t *from, const nodemask_t *to,
25 unsigned long flags); 27 unsigned long flags);
28extern void migrate_page_copy(struct page *newpage, struct page *page);
29extern int migrate_huge_page_move_mapping(struct address_space *mapping,
30 struct page *newpage, struct page *page);
26#else 31#else
27#define PAGE_MIGRATION 0 32#define PAGE_MIGRATION 0
28 33
29static inline void putback_lru_pages(struct list_head *l) {} 34static inline void putback_lru_pages(struct list_head *l) {}
30static inline int migrate_pages(struct list_head *l, new_page_t x, 35static inline int migrate_pages(struct list_head *l, new_page_t x,
31 unsigned long private, int offlining) { return -ENOSYS; } 36 unsigned long private, int offlining) { return -ENOSYS; }
37static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
38 unsigned long private, int offlining) { return -ENOSYS; }
32 39
33static inline int migrate_prep(void) { return -ENOSYS; } 40static inline int migrate_prep(void) { return -ENOSYS; }
34static inline int migrate_prep_local(void) { return -ENOSYS; } 41static inline int migrate_prep_local(void) { return -ENOSYS; }
@@ -40,6 +47,15 @@ static inline int migrate_vmas(struct mm_struct *mm,
40 return -ENOSYS; 47 return -ENOSYS;
41} 48}
42 49
50static inline void migrate_page_copy(struct page *newpage,
51 struct page *page) {}
52
53static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
54 struct page *newpage, struct page *page)
55{
56 return -ENOSYS;
57}
58
43/* Possible settings for the migrate_page() method in address_operations */ 59/* Possible settings for the migrate_page() method in address_operations */
44#define migrate_page NULL 60#define migrate_page NULL
45#define fail_migrate_page NULL 61#define fail_migrate_page NULL
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7687228dd3b7..a4c66846fb8f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -718,12 +718,20 @@ static inline int page_mapped(struct page *page)
718#define VM_FAULT_SIGBUS 0x0002 718#define VM_FAULT_SIGBUS 0x0002
719#define VM_FAULT_MAJOR 0x0004 719#define VM_FAULT_MAJOR 0x0004
720#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ 720#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
721#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned page */ 721#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */
722#define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */
722 723
723#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ 724#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
724#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ 725#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
725 726
726#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON) 727#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
728
729#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
730 VM_FAULT_HWPOISON_LARGE)
731
732/* Encode hstate index for a hwpoisoned large page */
733#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
734#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
727 735
728/* 736/*
729 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. 737 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h
index b363b916c909..3ff4961da9b5 100644
--- a/include/linux/signalfd.h
+++ b/include/linux/signalfd.h
@@ -33,6 +33,7 @@ struct signalfd_siginfo {
33 __u64 ssi_utime; 33 __u64 ssi_utime;
34 __u64 ssi_stime; 34 __u64 ssi_stime;
35 __u64 ssi_addr; 35 __u64 ssi_addr;
36 __u16 ssi_addr_lsb;
36 37
37 /* 38 /*
38 * Pad structure to 128 bytes. Remember to update the 39 * Pad structure to 128 bytes. Remember to update the
@@ -43,7 +44,7 @@ struct signalfd_siginfo {
43 * comes out of a read(2) and we really don't want to have 44 * comes out of a read(2) and we really don't want to have
44 * a compat on read(2). 45 * a compat on read(2).
45 */ 46 */
46 __u8 __pad[48]; 47 __u8 __pad[46];
47}; 48};
48 49
49 50
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182..96991ded82fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page,
423 } 423 }
424} 424}
425 425
426static void copy_gigantic_page(struct page *dst, struct page *src, 426static void copy_user_gigantic_page(struct page *dst, struct page *src,
427 unsigned long addr, struct vm_area_struct *vma) 427 unsigned long addr, struct vm_area_struct *vma)
428{ 428{
429 int i; 429 int i;
430 struct hstate *h = hstate_vma(vma); 430 struct hstate *h = hstate_vma(vma);
431 struct page *dst_base = dst; 431 struct page *dst_base = dst;
432 struct page *src_base = src; 432 struct page *src_base = src;
433 might_sleep(); 433
434 for (i = 0; i < pages_per_huge_page(h); ) { 434 for (i = 0; i < pages_per_huge_page(h); ) {
435 cond_resched(); 435 cond_resched();
436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); 436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
@@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src,
440 src = mem_map_next(src, src_base, i); 440 src = mem_map_next(src, src_base, i);
441 } 441 }
442} 442}
443static void copy_huge_page(struct page *dst, struct page *src, 443
444static void copy_user_huge_page(struct page *dst, struct page *src,
444 unsigned long addr, struct vm_area_struct *vma) 445 unsigned long addr, struct vm_area_struct *vma)
445{ 446{
446 int i; 447 int i;
447 struct hstate *h = hstate_vma(vma); 448 struct hstate *h = hstate_vma(vma);
448 449
449 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { 450 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
450 copy_gigantic_page(dst, src, addr, vma); 451 copy_user_gigantic_page(dst, src, addr, vma);
451 return; 452 return;
452 } 453 }
453 454
@@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src,
458 } 459 }
459} 460}
460 461
462static void copy_gigantic_page(struct page *dst, struct page *src)
463{
464 int i;
465 struct hstate *h = page_hstate(src);
466 struct page *dst_base = dst;
467 struct page *src_base = src;
468
469 for (i = 0; i < pages_per_huge_page(h); ) {
470 cond_resched();
471 copy_highpage(dst, src);
472
473 i++;
474 dst = mem_map_next(dst, dst_base, i);
475 src = mem_map_next(src, src_base, i);
476 }
477}
478
479void copy_huge_page(struct page *dst, struct page *src)
480{
481 int i;
482 struct hstate *h = page_hstate(src);
483
484 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
485 copy_gigantic_page(dst, src);
486 return;
487 }
488
489 might_sleep();
490 for (i = 0; i < pages_per_huge_page(h); i++) {
491 cond_resched();
492 copy_highpage(dst + i, src + i);
493 }
494}
495
461static void enqueue_huge_page(struct hstate *h, struct page *page) 496static void enqueue_huge_page(struct hstate *h, struct page *page)
462{ 497{
463 int nid = page_to_nid(page); 498 int nid = page_to_nid(page);
@@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
466 h->free_huge_pages_node[nid]++; 501 h->free_huge_pages_node[nid]++;
467} 502}
468 503
504static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
505{
506 struct page *page;
507
508 if (list_empty(&h->hugepage_freelists[nid]))
509 return NULL;
510 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
511 list_del(&page->lru);
512 set_page_refcounted(page);
513 h->free_huge_pages--;
514 h->free_huge_pages_node[nid]--;
515 return page;
516}
517
469static struct page *dequeue_huge_page_vma(struct hstate *h, 518static struct page *dequeue_huge_page_vma(struct hstate *h,
470 struct vm_area_struct *vma, 519 struct vm_area_struct *vma,
471 unsigned long address, int avoid_reserve) 520 unsigned long address, int avoid_reserve)
472{ 521{
473 int nid;
474 struct page *page = NULL; 522 struct page *page = NULL;
475 struct mempolicy *mpol; 523 struct mempolicy *mpol;
476 nodemask_t *nodemask; 524 nodemask_t *nodemask;
@@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
496 544
497 for_each_zone_zonelist_nodemask(zone, z, zonelist, 545 for_each_zone_zonelist_nodemask(zone, z, zonelist,
498 MAX_NR_ZONES - 1, nodemask) { 546 MAX_NR_ZONES - 1, nodemask) {
499 nid = zone_to_nid(zone); 547 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
500 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 548 page = dequeue_huge_page_node(h, zone_to_nid(zone));
501 !list_empty(&h->hugepage_freelists[nid])) { 549 if (page) {
502 page = list_entry(h->hugepage_freelists[nid].next, 550 if (!avoid_reserve)
503 struct page, lru); 551 decrement_hugepage_resv_vma(h, vma);
504 list_del(&page->lru); 552 break;
505 h->free_huge_pages--; 553 }
506 h->free_huge_pages_node[nid]--;
507
508 if (!avoid_reserve)
509 decrement_hugepage_resv_vma(h, vma);
510
511 break;
512 } 554 }
513 } 555 }
514err: 556err:
@@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
770 return ret; 812 return ret;
771} 813}
772 814
773static struct page *alloc_buddy_huge_page(struct hstate *h, 815static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
774 struct vm_area_struct *vma, unsigned long address)
775{ 816{
776 struct page *page; 817 struct page *page;
777 unsigned int nid; 818 unsigned int r_nid;
778 819
779 if (h->order >= MAX_ORDER) 820 if (h->order >= MAX_ORDER)
780 return NULL; 821 return NULL;
@@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
812 } 853 }
813 spin_unlock(&hugetlb_lock); 854 spin_unlock(&hugetlb_lock);
814 855
815 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 856 if (nid == NUMA_NO_NODE)
816 __GFP_REPEAT|__GFP_NOWARN, 857 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
817 huge_page_order(h)); 858 __GFP_REPEAT|__GFP_NOWARN,
859 huge_page_order(h));
860 else
861 page = alloc_pages_exact_node(nid,
862 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
863 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
818 864
819 if (page && arch_prepare_hugepage(page)) { 865 if (page && arch_prepare_hugepage(page)) {
820 __free_pages(page, huge_page_order(h)); 866 __free_pages(page, huge_page_order(h));
@@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
823 869
824 spin_lock(&hugetlb_lock); 870 spin_lock(&hugetlb_lock);
825 if (page) { 871 if (page) {
826 /* 872 r_nid = page_to_nid(page);
827 * This page is now managed by the hugetlb allocator and has
828 * no users -- drop the buddy allocator's reference.
829 */
830 put_page_testzero(page);
831 VM_BUG_ON(page_count(page));
832 nid = page_to_nid(page);
833 set_compound_page_dtor(page, free_huge_page); 873 set_compound_page_dtor(page, free_huge_page);
834 /* 874 /*
835 * We incremented the global counters already 875 * We incremented the global counters already
836 */ 876 */
837 h->nr_huge_pages_node[nid]++; 877 h->nr_huge_pages_node[r_nid]++;
838 h->surplus_huge_pages_node[nid]++; 878 h->surplus_huge_pages_node[r_nid]++;
839 __count_vm_event(HTLB_BUDDY_PGALLOC); 879 __count_vm_event(HTLB_BUDDY_PGALLOC);
840 } else { 880 } else {
841 h->nr_huge_pages--; 881 h->nr_huge_pages--;
@@ -848,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
848} 888}
849 889
850/* 890/*
891 * This allocation function is useful in the context where vma is irrelevant.
892 * E.g. soft-offlining uses this function because it only cares about the
893 * physical address of the error page.
894 */
895struct page *alloc_huge_page_node(struct hstate *h, int nid)
896{
897 struct page *page;
898
899 spin_lock(&hugetlb_lock);
900 page = dequeue_huge_page_node(h, nid);
901 spin_unlock(&hugetlb_lock);
902
903 if (!page)
904 page = alloc_buddy_huge_page(h, nid);
905
906 return page;
907}
908
909/*
851 * Increase the hugetlb pool such that it can accommodate a reservation 910 * Increase the hugetlb pool such that it can accommodate a reservation
852 * of size 'delta'. 911 * of size 'delta'.
853 */ 912 */
@@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
871retry: 930retry:
872 spin_unlock(&hugetlb_lock); 931 spin_unlock(&hugetlb_lock);
873 for (i = 0; i < needed; i++) { 932 for (i = 0; i < needed; i++) {
874 page = alloc_buddy_huge_page(h, NULL, 0); 933 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
875 if (!page) { 934 if (!page)
876 /* 935 /*
877 * We were not able to allocate enough pages to 936 * We were not able to allocate enough pages to
878 * satisfy the entire reservation so we free what 937 * satisfy the entire reservation so we free what
879 * we've allocated so far. 938 * we've allocated so far.
880 */ 939 */
881 spin_lock(&hugetlb_lock);
882 needed = 0;
883 goto free; 940 goto free;
884 }
885 941
886 list_add(&page->lru, &surplus_list); 942 list_add(&page->lru, &surplus_list);
887 } 943 }
@@ -908,31 +964,31 @@ retry:
908 needed += allocated; 964 needed += allocated;
909 h->resv_huge_pages += delta; 965 h->resv_huge_pages += delta;
910 ret = 0; 966 ret = 0;
911free: 967
968 spin_unlock(&hugetlb_lock);
912 /* Free the needed pages to the hugetlb pool */ 969 /* Free the needed pages to the hugetlb pool */
913 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 970 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
914 if ((--needed) < 0) 971 if ((--needed) < 0)
915 break; 972 break;
916 list_del(&page->lru); 973 list_del(&page->lru);
974 /*
975 * This page is now managed by the hugetlb allocator and has
976 * no users -- drop the buddy allocator's reference.
977 */
978 put_page_testzero(page);
979 VM_BUG_ON(page_count(page));
917 enqueue_huge_page(h, page); 980 enqueue_huge_page(h, page);
918 } 981 }
919 982
920 /* Free unnecessary surplus pages to the buddy allocator */ 983 /* Free unnecessary surplus pages to the buddy allocator */
984free:
921 if (!list_empty(&surplus_list)) { 985 if (!list_empty(&surplus_list)) {
922 spin_unlock(&hugetlb_lock);
923 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 986 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
924 list_del(&page->lru); 987 list_del(&page->lru);
925 /* 988 put_page(page);
926 * The page has a reference count of zero already, so
927 * call free_huge_page directly instead of using
928 * put_page. This must be done with hugetlb_lock
929 * unlocked which is safe because free_huge_page takes
930 * hugetlb_lock before deciding how to free the page.
931 */
932 free_huge_page(page);
933 } 989 }
934 spin_lock(&hugetlb_lock);
935 } 990 }
991 spin_lock(&hugetlb_lock);
936 992
937 return ret; 993 return ret;
938} 994}
@@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1052 spin_unlock(&hugetlb_lock); 1108 spin_unlock(&hugetlb_lock);
1053 1109
1054 if (!page) { 1110 if (!page) {
1055 page = alloc_buddy_huge_page(h, vma, addr); 1111 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1056 if (!page) { 1112 if (!page) {
1057 hugetlb_put_quota(inode->i_mapping, chg); 1113 hugetlb_put_quota(inode->i_mapping, chg);
1058 return ERR_PTR(-VM_FAULT_SIGBUS); 1114 return ERR_PTR(-VM_FAULT_SIGBUS);
1059 } 1115 }
1060 } 1116 }
1061 1117
1062 set_page_refcounted(page);
1063 set_page_private(page, (unsigned long) mapping); 1118 set_page_private(page, (unsigned long) mapping);
1064 1119
1065 vma_commit_reservation(h, vma, addr); 1120 vma_commit_reservation(h, vma, addr);
@@ -2153,6 +2208,19 @@ nomem:
2153 return -ENOMEM; 2208 return -ENOMEM;
2154} 2209}
2155 2210
2211static int is_hugetlb_entry_migration(pte_t pte)
2212{
2213 swp_entry_t swp;
2214
2215 if (huge_pte_none(pte) || pte_present(pte))
2216 return 0;
2217 swp = pte_to_swp_entry(pte);
2218 if (non_swap_entry(swp) && is_migration_entry(swp)) {
2219 return 1;
2220 } else
2221 return 0;
2222}
2223
2156static int is_hugetlb_entry_hwpoisoned(pte_t pte) 2224static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2157{ 2225{
2158 swp_entry_t swp; 2226 swp_entry_t swp;
@@ -2383,7 +2451,7 @@ retry_avoidcopy:
2383 if (unlikely(anon_vma_prepare(vma))) 2451 if (unlikely(anon_vma_prepare(vma)))
2384 return VM_FAULT_OOM; 2452 return VM_FAULT_OOM;
2385 2453
2386 copy_huge_page(new_page, old_page, address, vma); 2454 copy_user_huge_page(new_page, old_page, address, vma);
2387 __SetPageUptodate(new_page); 2455 __SetPageUptodate(new_page);
2388 2456
2389 /* 2457 /*
@@ -2515,22 +2583,20 @@ retry:
2515 hugepage_add_new_anon_rmap(page, vma, address); 2583 hugepage_add_new_anon_rmap(page, vma, address);
2516 } 2584 }
2517 } else { 2585 } else {
2586 /*
2587 * If a memory error occurs between mmap() and the fault, some processes
2588 * do not have a hwpoisoned swap entry for the errored virtual address.
2589 * So we need to block the hugepage fault by checking the PG_hwpoison bit.
2590 */
2591 if (unlikely(PageHWPoison(page))) {
2592 ret = VM_FAULT_HWPOISON |
2593 VM_FAULT_SET_HINDEX(h - hstates);
2594 goto backout_unlocked;
2595 }
2518 page_dup_rmap(page); 2596 page_dup_rmap(page);
2519 } 2597 }
2520 2598
2521 /* 2599 /*
2522 * Since memory error handler replaces pte into hwpoison swap entry
2523 * at the time of error handling, a process which reserved but not have
2524 * the mapping to the error hugepage does not have hwpoison swap entry.
2525 * So we need to block accesses from such a process by checking
2526 * PG_hwpoison bit here.
2527 */
2528 if (unlikely(PageHWPoison(page))) {
2529 ret = VM_FAULT_HWPOISON;
2530 goto backout_unlocked;
2531 }
2532
2533 /*
2534 * If we are going to COW a private mapping later, we examine the 2600 * If we are going to COW a private mapping later, we examine the
2535 * pending reservations for this page now. This will ensure that 2601 * pending reservations for this page now. This will ensure that
2536 * any allocations necessary to record that reservation occur outside 2602 * any allocations necessary to record that reservation occur outside
@@ -2587,8 +2653,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2587 ptep = huge_pte_offset(mm, address); 2653 ptep = huge_pte_offset(mm, address);
2588 if (ptep) { 2654 if (ptep) {
2589 entry = huge_ptep_get(ptep); 2655 entry = huge_ptep_get(ptep);
2590 if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2656 if (unlikely(is_hugetlb_entry_migration(entry))) {
2591 return VM_FAULT_HWPOISON; 2657 migration_entry_wait(mm, (pmd_t *)ptep, address);
2658 return 0;
2659 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2660 return VM_FAULT_HWPOISON_LARGE |
2661 VM_FAULT_SET_HINDEX(h - hstates);
2592 } 2662 }
2593 2663
2594 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2664 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2878,18 +2948,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2878 hugetlb_acct_memory(h, -(chg - freed)); 2948 hugetlb_acct_memory(h, -(chg - freed));
2879} 2949}
2880 2950
2951#ifdef CONFIG_MEMORY_FAILURE
2952
2953/* Must be called with hugetlb_lock held */
2954static int is_hugepage_on_freelist(struct page *hpage)
2955{
2956 struct page *page;
2957 struct page *tmp;
2958 struct hstate *h = page_hstate(hpage);
2959 int nid = page_to_nid(hpage);
2960
2961 list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
2962 if (page == hpage)
2963 return 1;
2964 return 0;
2965}
2966
2881/* 2967/*
2882 * This function is called from memory failure code. 2968 * This function is called from memory failure code.
2883 * Assume the caller holds page lock of the head page. 2969 * Assume the caller holds page lock of the head page.
2884 */ 2970 */
2885void __isolate_hwpoisoned_huge_page(struct page *hpage) 2971int dequeue_hwpoisoned_huge_page(struct page *hpage)
2886{ 2972{
2887 struct hstate *h = page_hstate(hpage); 2973 struct hstate *h = page_hstate(hpage);
2888 int nid = page_to_nid(hpage); 2974 int nid = page_to_nid(hpage);
2975 int ret = -EBUSY;
2889 2976
2890 spin_lock(&hugetlb_lock); 2977 spin_lock(&hugetlb_lock);
2891 list_del(&hpage->lru); 2978 if (is_hugepage_on_freelist(hpage)) {
2892 h->free_huge_pages--; 2979 list_del(&hpage->lru);
2893 h->free_huge_pages_node[nid]--; 2980 set_page_refcounted(hpage);
2981 h->free_huge_pages--;
2982 h->free_huge_pages_node[nid]--;
2983 ret = 0;
2984 }
2894 spin_unlock(&hugetlb_lock); 2985 spin_unlock(&hugetlb_lock);
2986 return ret;
2895} 2987}
2988#endif
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 757f6b0accfe..44a8cefeae6e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@
7 * Free Software Foundation. 7 * Free Software Foundation.
8 * 8 *
9 * High level machine check handler. Handles pages reported by the 9 * High level machine check handler. Handles pages reported by the
10 * hardware as being corrupted usually due to a 2bit ECC memory or cache 10 * hardware as being corrupted usually due to a multi-bit ECC memory or cache
11 * failure. 11 * failure.
12 *
13 * In addition there is a "soft offline" entry point that allows us to stop
14 * using not-yet-corrupted-but-suspicious pages without killing anything.
12 * 15 *
13 * Handles page cache pages in various states. The tricky part 16 * Handles page cache pages in various states. The tricky part
14 * here is that we can access any page asynchronous to other VM 17 * here is that we can access any page asynchronously in respect to
15 * users, because memory failures could happen anytime and anywhere, 18 * other VM users, because memory failures could happen anytime and
16 * possibly violating some of their assumptions. This is why this code 19 * anywhere. This could violate some of their assumptions. This is why
17 * has to be extremely careful. Generally it tries to use normal locking 20 * this code has to be extremely careful. Generally it tries to use
18 * rules, as in get the standard locks, even if that means the 21 * normal locking rules, as in get the standard locks, even if that means
19 * error handling takes potentially a long time. 22 * the error handling takes potentially a long time.
20 * 23 *
21 * The operation to map back from RMAP chains to processes has to walk 24 * There are several operations here with exponential complexity because
22 * the complete process list and has non linear complexity with the number 25 * of unsuitable VM data structures. For example the operation to map back
23 * mappings. In short it can be quite slow. But since memory corruptions 26 * from RMAP chains to processes has to walk the complete process list and
24 * are rare we hope to get away with this. 27 * has non linear complexity with the number of mappings. But since memory corruptions
28 * are rare we hope to get away with this. This avoids impacting the core
29 * VM.
25 */ 30 */
26 31
27/* 32/*
@@ -30,7 +35,6 @@
30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages 35 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
31 * - pass bad pages to kdump next kernel 36 * - pass bad pages to kdump next kernel
32 */ 37 */
33#define DEBUG 1 /* remove me in 2.6.34 */
34#include <linux/kernel.h> 38#include <linux/kernel.h>
35#include <linux/mm.h> 39#include <linux/mm.h>
36#include <linux/page-flags.h> 40#include <linux/page-flags.h>
@@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p)
78 return 0; 82 return 0;
79 83
80 /* 84 /*
81 * page_mapping() does not accept slab page 85 * page_mapping() does not accept slab pages.
82 */ 86 */
83 if (PageSlab(p)) 87 if (PageSlab(p))
84 return -EINVAL; 88 return -EINVAL;
@@ -268,7 +272,7 @@ struct to_kill {
268 struct list_head nd; 272 struct list_head nd;
269 struct task_struct *tsk; 273 struct task_struct *tsk;
270 unsigned long addr; 274 unsigned long addr;
271 unsigned addr_valid:1; 275 char addr_valid;
272}; 276};
273 277
274/* 278/*
@@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
309 * a SIGKILL because the error is not contained anymore. 313 * a SIGKILL because the error is not contained anymore.
310 */ 314 */
311 if (tk->addr == -EFAULT) { 315 if (tk->addr == -EFAULT) {
312 pr_debug("MCE: Unable to find user space address %lx in %s\n", 316 pr_info("MCE: Unable to find user space address %lx in %s\n",
313 page_to_pfn(p), tsk->comm); 317 page_to_pfn(p), tsk->comm);
314 tk->addr_valid = 0; 318 tk->addr_valid = 0;
315 } 319 }
@@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
577 pfn, err); 581 pfn, err);
578 } else if (page_has_private(p) && 582 } else if (page_has_private(p) &&
579 !try_to_release_page(p, GFP_NOIO)) { 583 !try_to_release_page(p, GFP_NOIO)) {
580 pr_debug("MCE %#lx: failed to release buffers\n", pfn); 584 pr_info("MCE %#lx: failed to release buffers\n", pfn);
581 } else { 585 } else {
582 ret = RECOVERED; 586 ret = RECOVERED;
583 } 587 }
@@ -693,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
693 * Issues: 697 * Issues:
694 * - Error on hugepage is contained in hugepage unit (not in raw page unit.) 698 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
695 * To narrow down kill region to one page, we need to break up pmd. 699 * To narrow down kill region to one page, we need to break up pmd.
696 * - To support soft-offlining for hugepage, we need to support hugepage
697 * migration.
698 */ 700 */
699static int me_huge_page(struct page *p, unsigned long pfn) 701static int me_huge_page(struct page *p, unsigned long pfn)
700{ 702{
703 int res = 0;
701 struct page *hpage = compound_head(p); 704 struct page *hpage = compound_head(p);
702 /* 705 /*
703 * We can safely recover from error on free or reserved (i.e. 706 * We can safely recover from error on free or reserved (i.e.
@@ -710,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
710 * so there is no race between isolation and mapping/unmapping. 713 * so there is no race between isolation and mapping/unmapping.
711 */ 714 */
712 if (!(page_mapping(hpage) || PageAnon(hpage))) { 715 if (!(page_mapping(hpage) || PageAnon(hpage))) {
713 __isolate_hwpoisoned_huge_page(hpage); 716 res = dequeue_hwpoisoned_huge_page(hpage);
714 return RECOVERED; 717 if (!res)
718 return RECOVERED;
715 } 719 }
716 return DELAYED; 720 return DELAYED;
717} 721}
@@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p,
836 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; 840 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
837} 841}
838 842
839#define N_UNMAP_TRIES 5
840
841/* 843/*
842 * Do all that is necessary to remove user space mappings. Unmap 844 * Do all that is necessary to remove user space mappings. Unmap
843 * the pages and send SIGBUS to the processes if the data was dirty. 845 * the pages and send SIGBUS to the processes if the data was dirty.
@@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
849 struct address_space *mapping; 851 struct address_space *mapping;
850 LIST_HEAD(tokill); 852 LIST_HEAD(tokill);
851 int ret; 853 int ret;
852 int i;
853 int kill = 1; 854 int kill = 1;
854 struct page *hpage = compound_head(p); 855 struct page *hpage = compound_head(p);
855 856
@@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
903 if (kill) 904 if (kill)
904 collect_procs(hpage, &tokill); 905 collect_procs(hpage, &tokill);
905 906
906 /* 907 ret = try_to_unmap(hpage, ttu);
907 * try_to_unmap can fail temporarily due to races.
908 * Try a few times (RED-PEN better strategy?)
909 */
910 for (i = 0; i < N_UNMAP_TRIES; i++) {
911 ret = try_to_unmap(hpage, ttu);
912 if (ret == SWAP_SUCCESS)
913 break;
914 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
915 }
916
917 if (ret != SWAP_SUCCESS) 908 if (ret != SWAP_SUCCESS)
918 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 909 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
919 pfn, page_mapcount(hpage)); 910 pfn, page_mapcount(hpage));
@@ -981,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
981 * We need/can do nothing about count=0 pages. 972 * We need/can do nothing about count=0 pages.
982 * 1) it's a free page, and therefore in safe hand: 973 * 1) it's a free page, and therefore in safe hand:
983 * prep_new_page() will be the gate keeper. 974 * prep_new_page() will be the gate keeper.
984 * 2) it's part of a non-compound high order page. 975 * 2) it's a free hugepage, which is also safe:
976 * an affected hugepage will be dequeued from hugepage freelist,
977 * so there's no concern about reusing it ever after.
978 * 3) it's part of a non-compound high order page.
985 * Implies some kernel user: cannot stop them from 979 * Implies some kernel user: cannot stop them from
986 * R/W the page; let's pray that the page has been 980 * R/W the page; let's pray that the page has been
987 * used and will be freed some time later. 981 * used and will be freed some time later.
@@ -993,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
993 if (is_free_buddy_page(p)) { 987 if (is_free_buddy_page(p)) {
994 action_result(pfn, "free buddy", DELAYED); 988 action_result(pfn, "free buddy", DELAYED);
995 return 0; 989 return 0;
990 } else if (PageHuge(hpage)) {
991 /*
992 * Check "just unpoisoned", "filter hit", and
993 * "race with other subpage."
994 */
995 lock_page_nosync(hpage);
996 if (!PageHWPoison(hpage)
997 || (hwpoison_filter(p) && TestClearPageHWPoison(p))
998 || (p != hpage && TestSetPageHWPoison(hpage))) {
999 atomic_long_sub(nr_pages, &mce_bad_pages);
1000 return 0;
1001 }
1002 set_page_hwpoison_huge_page(hpage);
1003 res = dequeue_hwpoisoned_huge_page(hpage);
1004 action_result(pfn, "free huge",
1005 res ? IGNORED : DELAYED);
1006 unlock_page(hpage);
1007 return res;
996 } else { 1008 } else {
997 action_result(pfn, "high order kernel", IGNORED); 1009 action_result(pfn, "high order kernel", IGNORED);
998 return -EBUSY; 1010 return -EBUSY;
@@ -1147,16 +1159,26 @@ int unpoison_memory(unsigned long pfn)
1147 page = compound_head(p); 1159 page = compound_head(p);
1148 1160
1149 if (!PageHWPoison(p)) { 1161 if (!PageHWPoison(p)) {
1150 pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); 1162 pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
1151 return 0; 1163 return 0;
1152 } 1164 }
1153 1165
1154 nr_pages = 1 << compound_order(page); 1166 nr_pages = 1 << compound_order(page);
1155 1167
1156 if (!get_page_unless_zero(page)) { 1168 if (!get_page_unless_zero(page)) {
1169 /*
1170 * A HWPoisoned hugepage should have a non-zero refcount, so a
1171 * zero refcount here means we are racing with memory failure.
1172 * In that case unpoison fails and memory failure runs
1173 * to the end.
1174 */
1175 if (PageHuge(page)) {
1176 pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1177 return 0;
1178 }
1157 if (TestClearPageHWPoison(p)) 1179 if (TestClearPageHWPoison(p))
1158 atomic_long_sub(nr_pages, &mce_bad_pages); 1180 atomic_long_sub(nr_pages, &mce_bad_pages);
1159 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); 1181 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1160 return 0; 1182 return 0;
1161 } 1183 }
1162 1184
@@ -1168,12 +1190,12 @@ int unpoison_memory(unsigned long pfn)
1168 * the free buddy page pool. 1190 * the free buddy page pool.
1169 */ 1191 */
1170 if (TestClearPageHWPoison(page)) { 1192 if (TestClearPageHWPoison(page)) {
1171 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); 1193 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1172 atomic_long_sub(nr_pages, &mce_bad_pages); 1194 atomic_long_sub(nr_pages, &mce_bad_pages);
1173 freeit = 1; 1195 freeit = 1;
1196 if (PageHuge(page))
1197 clear_page_hwpoison_huge_page(page);
1174 } 1198 }
1175 if (PageHuge(p))
1176 clear_page_hwpoison_huge_page(page);
1177 unlock_page(page); 1199 unlock_page(page);
1178 1200
1179 put_page(page); 1201 put_page(page);
@@ -1187,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory);
1187static struct page *new_page(struct page *p, unsigned long private, int **x) 1209static struct page *new_page(struct page *p, unsigned long private, int **x)
1188{ 1210{
1189 int nid = page_to_nid(p); 1211 int nid = page_to_nid(p);
1190 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); 1212 if (PageHuge(p))
1213 return alloc_huge_page_node(page_hstate(compound_head(p)),
1214 nid);
1215 else
1216 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1191} 1217}
1192 1218
1193/* 1219/*
@@ -1215,14 +1241,21 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1215 * was free. 1241 * was free.
1216 */ 1242 */
1217 set_migratetype_isolate(p); 1243 set_migratetype_isolate(p);
1244 /*
1245 * When the target page is a free hugepage, just remove it
1246 * from free hugepage list.
1247 */
1218 if (!get_page_unless_zero(compound_head(p))) { 1248 if (!get_page_unless_zero(compound_head(p))) {
1219 if (is_free_buddy_page(p)) { 1249 if (PageHuge(p)) {
1220 pr_debug("get_any_page: %#lx free buddy page\n", pfn); 1250 pr_info("get_any_page: %#lx free huge page\n", pfn);
1251 ret = dequeue_hwpoisoned_huge_page(compound_head(p));
1252 } else if (is_free_buddy_page(p)) {
1253 pr_info("get_any_page: %#lx free buddy page\n", pfn);
1221 /* Set hwpoison bit while page is still isolated */ 1254 /* Set hwpoison bit while page is still isolated */
1222 SetPageHWPoison(p); 1255 SetPageHWPoison(p);
1223 ret = 0; 1256 ret = 0;
1224 } else { 1257 } else {
1225 pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", 1258 pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1226 pfn, p->flags); 1259 pfn, p->flags);
1227 ret = -EIO; 1260 ret = -EIO;
1228 } 1261 }
@@ -1235,6 +1268,45 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1235 return ret; 1268 return ret;
1236} 1269}
1237 1270
1271static int soft_offline_huge_page(struct page *page, int flags)
1272{
1273 int ret;
1274 unsigned long pfn = page_to_pfn(page);
1275 struct page *hpage = compound_head(page);
1276 LIST_HEAD(pagelist);
1277
1278 ret = get_any_page(page, pfn, flags);
1279 if (ret < 0)
1280 return ret;
1281 if (ret == 0)
1282 goto done;
1283
1284 if (PageHWPoison(hpage)) {
1285 put_page(hpage);
1286 pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
1287 return -EBUSY;
1288 }
1289
1290 /* Keep page count to indicate a given hugepage is isolated. */
1291
1292 list_add(&hpage->lru, &pagelist);
1293 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1294 if (ret) {
1295 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1296 pfn, ret, page->flags);
1297 if (ret > 0)
1298 ret = -EIO;
1299 return ret;
1300 }
1301done:
1302 if (!PageHWPoison(hpage))
1303 atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
1304 set_page_hwpoison_huge_page(hpage);
1305 dequeue_hwpoisoned_huge_page(hpage);
1306 /* keep elevated page count for bad page */
1307 return ret;
1308}
1309
1238/** 1310/**
1239 * soft_offline_page - Soft offline a page. 1311 * soft_offline_page - Soft offline a page.
1240 * @page: page to offline 1312 * @page: page to offline
@@ -1262,6 +1334,9 @@ int soft_offline_page(struct page *page, int flags)
1262 int ret; 1334 int ret;
1263 unsigned long pfn = page_to_pfn(page); 1335 unsigned long pfn = page_to_pfn(page);
1264 1336
1337 if (PageHuge(page))
1338 return soft_offline_huge_page(page, flags);
1339
1265 ret = get_any_page(page, pfn, flags); 1340 ret = get_any_page(page, pfn, flags);
1266 if (ret < 0) 1341 if (ret < 0)
1267 return ret; 1342 return ret;
@@ -1288,7 +1363,7 @@ int soft_offline_page(struct page *page, int flags)
1288 goto done; 1363 goto done;
1289 } 1364 }
1290 if (!PageLRU(page)) { 1365 if (!PageLRU(page)) {
1291 pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", 1366 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1292 pfn, page->flags); 1367 pfn, page->flags);
1293 return -EIO; 1368 return -EIO;
1294 } 1369 }
@@ -1302,7 +1377,7 @@ int soft_offline_page(struct page *page, int flags)
1302 if (PageHWPoison(page)) { 1377 if (PageHWPoison(page)) {
1303 unlock_page(page); 1378 unlock_page(page);
1304 put_page(page); 1379 put_page(page);
1305 pr_debug("soft offline: %#lx page already poisoned\n", pfn); 1380 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1306 return -EBUSY; 1381 return -EBUSY;
1307 } 1382 }
1308 1383
@@ -1323,7 +1398,7 @@ int soft_offline_page(struct page *page, int flags)
1323 put_page(page); 1398 put_page(page);
1324 if (ret == 1) { 1399 if (ret == 1) {
1325 ret = 0; 1400 ret = 0;
1326 pr_debug("soft_offline: %#lx: invalidated\n", pfn); 1401 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1327 goto done; 1402 goto done;
1328 } 1403 }
1329 1404
@@ -1339,13 +1414,13 @@ int soft_offline_page(struct page *page, int flags)
1339 list_add(&page->lru, &pagelist); 1414 list_add(&page->lru, &pagelist);
1340 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1415 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1341 if (ret) { 1416 if (ret) {
1342 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1417 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1343 pfn, ret, page->flags); 1418 pfn, ret, page->flags);
1344 if (ret > 0) 1419 if (ret > 0)
1345 ret = -EIO; 1420 ret = -EIO;
1346 } 1421 }
1347 } else { 1422 } else {
1348 pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1423 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1349 pfn, ret, page_count(page), page->flags); 1424 pfn, ret, page_count(page), page->flags);
1350 } 1425 }
1351 if (ret) 1426 if (ret)
diff --git a/mm/memory.c b/mm/memory.c
index 98b58fecedef..af82741caaa4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1450 if (ret & VM_FAULT_OOM) 1450 if (ret & VM_FAULT_OOM)
1451 return i ? i : -ENOMEM; 1451 return i ? i : -ENOMEM;
1452 if (ret & 1452 if (ret &
1453 (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) 1453 (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
1454 VM_FAULT_SIGBUS))
1454 return i ? i : -EFAULT; 1455 return i ? i : -EFAULT;
1455 BUG(); 1456 BUG();
1456 } 1457 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f4..f8c9bccf2520 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,6 +32,7 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/hugetlb.h>
35#include <linux/gfp.h> 36#include <linux/gfp.h>
36 37
37#include "internal.h" 38#include "internal.h"
@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
95 pte_t *ptep, pte; 96 pte_t *ptep, pte;
96 spinlock_t *ptl; 97 spinlock_t *ptl;
97 98
98 pgd = pgd_offset(mm, addr); 99 if (unlikely(PageHuge(new))) {
99 if (!pgd_present(*pgd)) 100 ptep = huge_pte_offset(mm, addr);
100 goto out; 101 if (!ptep)
102 goto out;
103 ptl = &mm->page_table_lock;
104 } else {
105 pgd = pgd_offset(mm, addr);
106 if (!pgd_present(*pgd))
107 goto out;
101 108
102 pud = pud_offset(pgd, addr); 109 pud = pud_offset(pgd, addr);
103 if (!pud_present(*pud)) 110 if (!pud_present(*pud))
104 goto out; 111 goto out;
105 112
106 pmd = pmd_offset(pud, addr); 113 pmd = pmd_offset(pud, addr);
107 if (!pmd_present(*pmd)) 114 if (!pmd_present(*pmd))
108 goto out; 115 goto out;
109 116
110 ptep = pte_offset_map(pmd, addr); 117 ptep = pte_offset_map(pmd, addr);
111 118
112 if (!is_swap_pte(*ptep)) { 119 if (!is_swap_pte(*ptep)) {
113 pte_unmap(ptep); 120 pte_unmap(ptep);
114 goto out; 121 goto out;
115 } 122 }
123
124 ptl = pte_lockptr(mm, pmd);
125 }
116 126
117 ptl = pte_lockptr(mm, pmd);
118 spin_lock(ptl); 127 spin_lock(ptl);
119 pte = *ptep; 128 pte = *ptep;
120 if (!is_swap_pte(pte)) 129 if (!is_swap_pte(pte))
@@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
130 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 139 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
131 if (is_write_migration_entry(entry)) 140 if (is_write_migration_entry(entry))
132 pte = pte_mkwrite(pte); 141 pte = pte_mkwrite(pte);
142#ifdef CONFIG_HUGETLB_PAGE
143 if (PageHuge(new))
144 pte = pte_mkhuge(pte);
145#endif
133 flush_cache_page(vma, addr, pte_pfn(pte)); 146 flush_cache_page(vma, addr, pte_pfn(pte));
134 set_pte_at(mm, addr, ptep, pte); 147 set_pte_at(mm, addr, ptep, pte);
135 148
136 if (PageAnon(new)) 149 if (PageHuge(new)) {
150 if (PageAnon(new))
151 hugepage_add_anon_rmap(new, vma, addr);
152 else
153 page_dup_rmap(new);
154 } else if (PageAnon(new))
137 page_add_anon_rmap(new, vma, addr); 155 page_add_anon_rmap(new, vma, addr);
138 else 156 else
139 page_add_file_rmap(new); 157 page_add_file_rmap(new);
@@ -276,11 +294,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
276} 294}
277 295
278/* 296/*
297 * The expected number of remaining references is the same as that
298 * of migrate_page_move_mapping().
299 */
300int migrate_huge_page_move_mapping(struct address_space *mapping,
301 struct page *newpage, struct page *page)
302{
303 int expected_count;
304 void **pslot;
305
306 if (!mapping) {
307 if (page_count(page) != 1)
308 return -EAGAIN;
309 return 0;
310 }
311
312 spin_lock_irq(&mapping->tree_lock);
313
314 pslot = radix_tree_lookup_slot(&mapping->page_tree,
315 page_index(page));
316
317 expected_count = 2 + page_has_private(page);
318 if (page_count(page) != expected_count ||
319 (struct page *)radix_tree_deref_slot(pslot) != page) {
320 spin_unlock_irq(&mapping->tree_lock);
321 return -EAGAIN;
322 }
323
324 if (!page_freeze_refs(page, expected_count)) {
325 spin_unlock_irq(&mapping->tree_lock);
326 return -EAGAIN;
327 }
328
329 get_page(newpage);
330
331 radix_tree_replace_slot(pslot, newpage);
332
333 page_unfreeze_refs(page, expected_count);
334
335 __put_page(page);
336
337 spin_unlock_irq(&mapping->tree_lock);
338 return 0;
339}
340
341/*
279 * Copy the page to its new location 342 * Copy the page to its new location
280 */ 343 */
281static void migrate_page_copy(struct page *newpage, struct page *page) 344void migrate_page_copy(struct page *newpage, struct page *page)
282{ 345{
283 copy_highpage(newpage, page); 346 if (PageHuge(page))
347 copy_huge_page(newpage, page);
348 else
349 copy_highpage(newpage, page);
284 350
285 if (PageError(page)) 351 if (PageError(page))
286 SetPageError(newpage); 352 SetPageError(newpage);
@@ -724,6 +790,92 @@ move_newpage:
724} 790}
725 791
726/* 792/*
793 * Counterpart of unmap_and_move_page() for hugepage migration.
794 *
795 * This function doesn't wait for the completion of hugepage I/O
796 * because there is no race between I/O and migration for hugepage.
797 * Note that currently hugepage I/O occurs only in direct I/O
798 * where no lock is held and PG_writeback is irrelevant,
799 * and writeback status of all subpages is counted in the reference
800 * count of the head page (i.e. if all subpages of a 2MB hugepage are
801 * under direct I/O, the reference of the head page is 512 and a bit more.)
802 * This means that when we try to migrate hugepage whose subpages are
803 * doing direct I/O, some references remain after try_to_unmap() and
804 * hugepage migration fails without data corruption.
805 *
806 * There is also no race when direct I/O is issued on the page under migration,
807 * because then pte is replaced with migration swap entry and direct I/O code
808 * will wait in the page fault for migration to complete.
809 */
810static int unmap_and_move_huge_page(new_page_t get_new_page,
811 unsigned long private, struct page *hpage,
812 int force, int offlining)
813{
814 int rc = 0;
815 int *result = NULL;
816 struct page *new_hpage = get_new_page(hpage, private, &result);
817 int rcu_locked = 0;
818 struct anon_vma *anon_vma = NULL;
819
820 if (!new_hpage)
821 return -ENOMEM;
822
823 rc = -EAGAIN;
824
825 if (!trylock_page(hpage)) {
826 if (!force)
827 goto out;
828 lock_page(hpage);
829 }
830
831 if (PageAnon(hpage)) {
832 rcu_read_lock();
833 rcu_locked = 1;
834
835 if (page_mapped(hpage)) {
836 anon_vma = page_anon_vma(hpage);
837 atomic_inc(&anon_vma->external_refcount);
838 }
839 }
840
841 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
842
843 if (!page_mapped(hpage))
844 rc = move_to_new_page(new_hpage, hpage, 1);
845
846 if (rc)
847 remove_migration_ptes(hpage, hpage);
848
849 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
850 &anon_vma->lock)) {
851 int empty = list_empty(&anon_vma->head);
852 spin_unlock(&anon_vma->lock);
853 if (empty)
854 anon_vma_free(anon_vma);
855 }
856
857 if (rcu_locked)
858 rcu_read_unlock();
859out:
860 unlock_page(hpage);
861
862 if (rc != -EAGAIN) {
863 list_del(&hpage->lru);
864 put_page(hpage);
865 }
866
867 put_page(new_hpage);
868
869 if (result) {
870 if (rc)
871 *result = rc;
872 else
873 *result = page_to_nid(new_hpage);
874 }
875 return rc;
876}
877
878/*
727 * migrate_pages 879 * migrate_pages
728 * 880 *
729 * The function takes one list of pages to migrate and a function 881 * The function takes one list of pages to migrate and a function
@@ -788,6 +940,52 @@ out:
788 return nr_failed + retry; 940 return nr_failed + retry;
789} 941}
790 942
943int migrate_huge_pages(struct list_head *from,
944 new_page_t get_new_page, unsigned long private, int offlining)
945{
946 int retry = 1;
947 int nr_failed = 0;
948 int pass = 0;
949 struct page *page;
950 struct page *page2;
951 int rc;
952
953 for (pass = 0; pass < 10 && retry; pass++) {
954 retry = 0;
955
956 list_for_each_entry_safe(page, page2, from, lru) {
957 cond_resched();
958
959 rc = unmap_and_move_huge_page(get_new_page,
960 private, page, pass > 2, offlining);
961
962 switch(rc) {
963 case -ENOMEM:
964 goto out;
965 case -EAGAIN:
966 retry++;
967 break;
968 case 0:
969 break;
970 default:
971 /* Permanent failure */
972 nr_failed++;
973 break;
974 }
975 }
976 }
977 rc = 0;
978out:
979
980 list_for_each_entry_safe(page, page2, from, lru)
981 put_page(page);
982
983 if (rc)
984 return rc;
985
986 return nr_failed + retry;
987}
988
791#ifdef CONFIG_NUMA 989#ifdef CONFIG_NUMA
792/* 990/*
793 * Move a list of individual pages 991 * Move a list of individual pages
diff --git a/mm/rmap.c b/mm/rmap.c
index 5f17fad1bee8..f5ad996a4a8f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -780,10 +780,10 @@ void page_move_anon_rmap(struct page *page,
780} 780}
781 781
782/** 782/**
783 * __page_set_anon_rmap - setup new anonymous rmap 783 * __page_set_anon_rmap - set up new anonymous rmap
784 * @page: the page to add the mapping to 784 * @page: Page to add to rmap
785 * @vma: the vm area in which the mapping is added 785 * @vma: VM area to add page to.
786 * @address: the user virtual address mapped 786 * @address: User virtual address of the mapping
787 * @exclusive: the page is exclusively owned by the current process 787 * @exclusive: the page is exclusively owned by the current process
788 */ 788 */
789static void __page_set_anon_rmap(struct page *page, 789static void __page_set_anon_rmap(struct page *page,
@@ -793,25 +793,16 @@ static void __page_set_anon_rmap(struct page *page,
793 793
794 BUG_ON(!anon_vma); 794 BUG_ON(!anon_vma);
795 795
796 if (PageAnon(page))
797 return;
798
796 /* 799 /*
797 * If the page isn't exclusively mapped into this vma, 800 * If the page isn't exclusively mapped into this vma,
798 * we must use the _oldest_ possible anon_vma for the 801 * we must use the _oldest_ possible anon_vma for the
799 * page mapping! 802 * page mapping!
800 */ 803 */
801 if (!exclusive) { 804 if (!exclusive)
802 if (PageAnon(page))
803 return;
804 anon_vma = anon_vma->root; 805 anon_vma = anon_vma->root;
805 } else {
806 /*
807 * In this case, swapped-out-but-not-discarded swap-cache
808 * is remapped. So, no need to update page->mapping here.
809 * We convice anon_vma poitned by page->mapping is not obsolete
810 * because vma->anon_vma is necessary to be a family of it.
811 */
812 if (PageAnon(page))
813 return;
814 }
815 806
816 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 807 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
817 page->mapping = (struct address_space *) anon_vma; 808 page->mapping = (struct address_space *) anon_vma;