aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/vfs.txt7
-rw-r--r--Documentation/sysctl/vm.txt41
-rw-r--r--arch/x86/mm/fault.c19
-rw-r--r--fs/btrfs/inode.c1
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext3/inode.c3
-rw-r--r--fs/ext4/inode.c4
-rw-r--r--fs/gfs2/aops.c3
-rw-r--r--fs/nfs/file.c1
-rw-r--r--fs/ntfs/aops.c2
-rw-r--r--fs/ocfs2/aops.c1
-rw-r--r--fs/proc/meminfo.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c1
-rw-r--r--include/asm-generic/mman-common.h1
-rw-r--r--include/asm-generic/siginfo.h8
-rw-r--r--include/linux/fs.h1
-rw-r--r--include/linux/mm.h15
-rw-r--r--include/linux/page-flags.h17
-rw-r--r--include/linux/prctl.h2
-rw-r--r--include/linux/rmap.h21
-rw-r--r--include/linux/sched.h2
-rw-r--r--include/linux/swap.h34
-rw-r--r--include/linux/swapops.h38
-rw-r--r--kernel/sys.c22
-rw-r--r--kernel/sysctl.c25
-rw-r--r--mm/Kconfig14
-rw-r--r--mm/Makefile2
-rw-r--r--mm/filemap.c4
-rw-r--r--mm/hwpoison-inject.c41
-rw-r--r--mm/madvise.c30
-rw-r--r--mm/memory-failure.c832
-rw-r--r--mm/memory.c24
-rw-r--r--mm/migrate.c2
-rw-r--r--mm/page-writeback.c7
-rw-r--r--mm/page_alloc.c20
-rw-r--r--mm/rmap.c60
-rw-r--r--mm/shmem.c5
-rw-r--r--mm/swapfile.c4
-rw-r--r--mm/truncate.c72
-rw-r--r--mm/vmscan.c2
40 files changed, 1331 insertions, 68 deletions
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index f49eecf2e573..623f094c9d8d 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -536,6 +536,7 @@ struct address_space_operations {
536 /* migrate the contents of a page to the specified target */ 536 /* migrate the contents of a page to the specified target */
537 int (*migratepage) (struct page *, struct page *); 537 int (*migratepage) (struct page *, struct page *);
538 int (*launder_page) (struct page *); 538 int (*launder_page) (struct page *);
539 int (*error_remove_page) (struct mapping *mapping, struct page *page);
539}; 540};
540 541
541 writepage: called by the VM to write a dirty page to backing store. 542 writepage: called by the VM to write a dirty page to backing store.
@@ -694,6 +695,12 @@ struct address_space_operations {
694 prevent redirtying the page, it is kept locked during the whole 695 prevent redirtying the page, it is kept locked during the whole
695 operation. 696 operation.
696 697
698 error_remove_page: normally set to generic_error_remove_page if truncation
699 is ok for this address space. Used for memory failure handling.
700 Setting this implies you deal with pages going away under you,
701 unless you have them locked or reference counts increased.
702
703
697The File Object 704The File Object
698=============== 705===============
699 706
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index e6fb1ec2744b..a6e360d2055c 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/vm:
32- legacy_va_layout 32- legacy_va_layout
33- lowmem_reserve_ratio 33- lowmem_reserve_ratio
34- max_map_count 34- max_map_count
35- memory_failure_early_kill
36- memory_failure_recovery
35- min_free_kbytes 37- min_free_kbytes
36- min_slab_ratio 38- min_slab_ratio
37- min_unmapped_ratio 39- min_unmapped_ratio
@@ -53,7 +55,6 @@ Currently, these files are in /proc/sys/vm:
53- vfs_cache_pressure 55- vfs_cache_pressure
54- zone_reclaim_mode 56- zone_reclaim_mode
55 57
56
57============================================================== 58==============================================================
58 59
59block_dump 60block_dump
@@ -275,6 +276,44 @@ e.g., up to one or two maps per allocation.
275 276
276The default value is 65536. 277The default value is 65536.
277 278
279=============================================================
280
281memory_failure_early_kill:
282
283Control how to kill processes when uncorrected memory error (typically
284a 2bit error in a memory module) is detected in the background by hardware
285that cannot be handled by the kernel. In some cases (like the page
286still having a valid copy on disk) the kernel will handle the failure
287transparently without affecting any applications. But if there is
288no other uptodate copy of the data it will kill to prevent any data
289corruptions from propagating.
290
2911: Kill all processes that have the corrupted and not reloadable page mapped
292as soon as the corruption is detected. Note this is not supported
293for a few types of pages, like kernel internally allocated data or
294the swap cache, but works for the majority of user pages.
295
2960: Only unmap the corrupted page from all processes and only kill a process
297who tries to access it.
298
299The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
300handle this if they want to.
301
302This is only active on architectures/platforms with advanced machine
303check handling and depends on the hardware capabilities.
304
305Applications can override this setting individually with the PR_MCE_KILL prctl
306
307==============================================================
308
309memory_failure_recovery
310
311Enable memory failure recovery (when supported by the platform)
312
3131: Attempt recovery.
314
3150: Always panic on a memory failure.
316
278============================================================== 317==============================================================
279 318
280min_free_kbytes: 319min_free_kbytes:
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 82728f2c6d55..f4cee9028cf0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -167,6 +167,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
167 info.si_errno = 0; 167 info.si_errno = 0;
168 info.si_code = si_code; 168 info.si_code = si_code;
169 info.si_addr = (void __user *)address; 169 info.si_addr = (void __user *)address;
170 info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
170 171
171 force_sig_info(si_signo, &info, tsk); 172 force_sig_info(si_signo, &info, tsk);
172} 173}
@@ -790,10 +791,12 @@ out_of_memory(struct pt_regs *regs, unsigned long error_code,
790} 791}
791 792
792static void 793static void
793do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) 794do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
795 unsigned int fault)
794{ 796{
795 struct task_struct *tsk = current; 797 struct task_struct *tsk = current;
796 struct mm_struct *mm = tsk->mm; 798 struct mm_struct *mm = tsk->mm;
799 int code = BUS_ADRERR;
797 800
798 up_read(&mm->mmap_sem); 801 up_read(&mm->mmap_sem);
799 802
@@ -809,7 +812,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
809 tsk->thread.error_code = error_code; 812 tsk->thread.error_code = error_code;
810 tsk->thread.trap_no = 14; 813 tsk->thread.trap_no = 14;
811 814
812 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); 815#ifdef CONFIG_MEMORY_FAILURE
816 if (fault & VM_FAULT_HWPOISON) {
817 printk(KERN_ERR
818 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
819 tsk->comm, tsk->pid, address);
820 code = BUS_MCEERR_AR;
821 }
822#endif
823 force_sig_info_fault(SIGBUS, code, address, tsk);
813} 824}
814 825
815static noinline void 826static noinline void
@@ -819,8 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
819 if (fault & VM_FAULT_OOM) { 830 if (fault & VM_FAULT_OOM) {
820 out_of_memory(regs, error_code, address); 831 out_of_memory(regs, error_code, address);
821 } else { 832 } else {
822 if (fault & VM_FAULT_SIGBUS) 833 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
823 do_sigbus(regs, error_code, address); 834 do_sigbus(regs, error_code, address, fault);
824 else 835 else
825 BUG(); 836 BUG();
826 } 837 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9096fd0ca3ca..d154a3f365d5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5269,6 +5269,7 @@ static const struct address_space_operations btrfs_aops = {
5269 .invalidatepage = btrfs_invalidatepage, 5269 .invalidatepage = btrfs_invalidatepage,
5270 .releasepage = btrfs_releasepage, 5270 .releasepage = btrfs_releasepage,
5271 .set_page_dirty = btrfs_set_page_dirty, 5271 .set_page_dirty = btrfs_set_page_dirty,
5272 .error_remove_page = generic_error_remove_page,
5272}; 5273};
5273 5274
5274static const struct address_space_operations btrfs_symlink_aops = { 5275static const struct address_space_operations btrfs_symlink_aops = {
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 1c1638f873a4..ade634076d0a 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -819,6 +819,7 @@ const struct address_space_operations ext2_aops = {
819 .writepages = ext2_writepages, 819 .writepages = ext2_writepages,
820 .migratepage = buffer_migrate_page, 820 .migratepage = buffer_migrate_page,
821 .is_partially_uptodate = block_is_partially_uptodate, 821 .is_partially_uptodate = block_is_partially_uptodate,
822 .error_remove_page = generic_error_remove_page,
822}; 823};
823 824
824const struct address_space_operations ext2_aops_xip = { 825const struct address_space_operations ext2_aops_xip = {
@@ -837,6 +838,7 @@ const struct address_space_operations ext2_nobh_aops = {
837 .direct_IO = ext2_direct_IO, 838 .direct_IO = ext2_direct_IO,
838 .writepages = ext2_writepages, 839 .writepages = ext2_writepages,
839 .migratepage = buffer_migrate_page, 840 .migratepage = buffer_migrate_page,
841 .error_remove_page = generic_error_remove_page,
840}; 842};
841 843
842/* 844/*
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index cd098a7b77fc..acf1b1423327 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1830,6 +1830,7 @@ static const struct address_space_operations ext3_ordered_aops = {
1830 .direct_IO = ext3_direct_IO, 1830 .direct_IO = ext3_direct_IO,
1831 .migratepage = buffer_migrate_page, 1831 .migratepage = buffer_migrate_page,
1832 .is_partially_uptodate = block_is_partially_uptodate, 1832 .is_partially_uptodate = block_is_partially_uptodate,
1833 .error_remove_page = generic_error_remove_page,
1833}; 1834};
1834 1835
1835static const struct address_space_operations ext3_writeback_aops = { 1836static const struct address_space_operations ext3_writeback_aops = {
@@ -1845,6 +1846,7 @@ static const struct address_space_operations ext3_writeback_aops = {
1845 .direct_IO = ext3_direct_IO, 1846 .direct_IO = ext3_direct_IO,
1846 .migratepage = buffer_migrate_page, 1847 .migratepage = buffer_migrate_page,
1847 .is_partially_uptodate = block_is_partially_uptodate, 1848 .is_partially_uptodate = block_is_partially_uptodate,
1849 .error_remove_page = generic_error_remove_page,
1848}; 1850};
1849 1851
1850static const struct address_space_operations ext3_journalled_aops = { 1852static const struct address_space_operations ext3_journalled_aops = {
@@ -1859,6 +1861,7 @@ static const struct address_space_operations ext3_journalled_aops = {
1859 .invalidatepage = ext3_invalidatepage, 1861 .invalidatepage = ext3_invalidatepage,
1860 .releasepage = ext3_releasepage, 1862 .releasepage = ext3_releasepage,
1861 .is_partially_uptodate = block_is_partially_uptodate, 1863 .is_partially_uptodate = block_is_partially_uptodate,
1864 .error_remove_page = generic_error_remove_page,
1862}; 1865};
1863 1866
1864void ext3_set_aops(struct inode *inode) 1867void ext3_set_aops(struct inode *inode)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3a798737e305..064746fad581 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3386,6 +3386,7 @@ static const struct address_space_operations ext4_ordered_aops = {
3386 .direct_IO = ext4_direct_IO, 3386 .direct_IO = ext4_direct_IO,
3387 .migratepage = buffer_migrate_page, 3387 .migratepage = buffer_migrate_page,
3388 .is_partially_uptodate = block_is_partially_uptodate, 3388 .is_partially_uptodate = block_is_partially_uptodate,
3389 .error_remove_page = generic_error_remove_page,
3389}; 3390};
3390 3391
3391static const struct address_space_operations ext4_writeback_aops = { 3392static const struct address_space_operations ext4_writeback_aops = {
@@ -3401,6 +3402,7 @@ static const struct address_space_operations ext4_writeback_aops = {
3401 .direct_IO = ext4_direct_IO, 3402 .direct_IO = ext4_direct_IO,
3402 .migratepage = buffer_migrate_page, 3403 .migratepage = buffer_migrate_page,
3403 .is_partially_uptodate = block_is_partially_uptodate, 3404 .is_partially_uptodate = block_is_partially_uptodate,
3405 .error_remove_page = generic_error_remove_page,
3404}; 3406};
3405 3407
3406static const struct address_space_operations ext4_journalled_aops = { 3408static const struct address_space_operations ext4_journalled_aops = {
@@ -3415,6 +3417,7 @@ static const struct address_space_operations ext4_journalled_aops = {
3415 .invalidatepage = ext4_invalidatepage, 3417 .invalidatepage = ext4_invalidatepage,
3416 .releasepage = ext4_releasepage, 3418 .releasepage = ext4_releasepage,
3417 .is_partially_uptodate = block_is_partially_uptodate, 3419 .is_partially_uptodate = block_is_partially_uptodate,
3420 .error_remove_page = generic_error_remove_page,
3418}; 3421};
3419 3422
3420static const struct address_space_operations ext4_da_aops = { 3423static const struct address_space_operations ext4_da_aops = {
@@ -3431,6 +3434,7 @@ static const struct address_space_operations ext4_da_aops = {
3431 .direct_IO = ext4_direct_IO, 3434 .direct_IO = ext4_direct_IO,
3432 .migratepage = buffer_migrate_page, 3435 .migratepage = buffer_migrate_page,
3433 .is_partially_uptodate = block_is_partially_uptodate, 3436 .is_partially_uptodate = block_is_partially_uptodate,
3437 .error_remove_page = generic_error_remove_page,
3434}; 3438};
3435 3439
3436void ext4_set_aops(struct inode *inode) 3440void ext4_set_aops(struct inode *inode)
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 7ebae9a4ecc0..694b5d48f036 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1135,6 +1135,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
1135 .direct_IO = gfs2_direct_IO, 1135 .direct_IO = gfs2_direct_IO,
1136 .migratepage = buffer_migrate_page, 1136 .migratepage = buffer_migrate_page,
1137 .is_partially_uptodate = block_is_partially_uptodate, 1137 .is_partially_uptodate = block_is_partially_uptodate,
1138 .error_remove_page = generic_error_remove_page,
1138}; 1139};
1139 1140
1140static const struct address_space_operations gfs2_ordered_aops = { 1141static const struct address_space_operations gfs2_ordered_aops = {
@@ -1151,6 +1152,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
1151 .direct_IO = gfs2_direct_IO, 1152 .direct_IO = gfs2_direct_IO,
1152 .migratepage = buffer_migrate_page, 1153 .migratepage = buffer_migrate_page,
1153 .is_partially_uptodate = block_is_partially_uptodate, 1154 .is_partially_uptodate = block_is_partially_uptodate,
1155 .error_remove_page = generic_error_remove_page,
1154}; 1156};
1155 1157
1156static const struct address_space_operations gfs2_jdata_aops = { 1158static const struct address_space_operations gfs2_jdata_aops = {
@@ -1166,6 +1168,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
1166 .invalidatepage = gfs2_invalidatepage, 1168 .invalidatepage = gfs2_invalidatepage,
1167 .releasepage = gfs2_releasepage, 1169 .releasepage = gfs2_releasepage,
1168 .is_partially_uptodate = block_is_partially_uptodate, 1170 .is_partially_uptodate = block_is_partially_uptodate,
1171 .error_remove_page = generic_error_remove_page,
1169}; 1172};
1170 1173
1171void gfs2_set_aops(struct inode *inode) 1174void gfs2_set_aops(struct inode *inode)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 5021b75d2d1e..86d6b4db1096 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -525,6 +525,7 @@ const struct address_space_operations nfs_file_aops = {
525 .direct_IO = nfs_direct_IO, 525 .direct_IO = nfs_direct_IO,
526 .migratepage = nfs_migrate_page, 526 .migratepage = nfs_migrate_page,
527 .launder_page = nfs_launder_page, 527 .launder_page = nfs_launder_page,
528 .error_remove_page = generic_error_remove_page,
528}; 529};
529 530
530/* 531/*
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index b38f944f0667..cfce53cb65d7 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1550,6 +1550,7 @@ const struct address_space_operations ntfs_aops = {
1550 .migratepage = buffer_migrate_page, /* Move a page cache page from 1550 .migratepage = buffer_migrate_page, /* Move a page cache page from
1551 one physical page to an 1551 one physical page to an
1552 other. */ 1552 other. */
1553 .error_remove_page = generic_error_remove_page,
1553}; 1554};
1554 1555
1555/** 1556/**
@@ -1569,6 +1570,7 @@ const struct address_space_operations ntfs_mst_aops = {
1569 .migratepage = buffer_migrate_page, /* Move a page cache page from 1570 .migratepage = buffer_migrate_page, /* Move a page cache page from
1570 one physical page to an 1571 one physical page to an
1571 other. */ 1572 other. */
1573 .error_remove_page = generic_error_remove_page,
1572}; 1574};
1573 1575
1574#ifdef NTFS_RW 1576#ifdef NTFS_RW
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 72e76062a900..deb2b132ae5e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2022,4 +2022,5 @@ const struct address_space_operations ocfs2_aops = {
2022 .releasepage = ocfs2_releasepage, 2022 .releasepage = ocfs2_releasepage,
2023 .migratepage = buffer_migrate_page, 2023 .migratepage = buffer_migrate_page,
2024 .is_partially_uptodate = block_is_partially_uptodate, 2024 .is_partially_uptodate = block_is_partially_uptodate,
2025 .error_remove_page = generic_error_remove_page,
2025}; 2026};
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 171e052c07b3..c7bff4f603ff 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -97,7 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
97 "Committed_AS: %8lu kB\n" 97 "Committed_AS: %8lu kB\n"
98 "VmallocTotal: %8lu kB\n" 98 "VmallocTotal: %8lu kB\n"
99 "VmallocUsed: %8lu kB\n" 99 "VmallocUsed: %8lu kB\n"
100 "VmallocChunk: %8lu kB\n", 100 "VmallocChunk: %8lu kB\n"
101#ifdef CONFIG_MEMORY_FAILURE
102 "HardwareCorrupted: %8lu kB\n"
103#endif
104 ,
101 K(i.totalram), 105 K(i.totalram),
102 K(i.freeram), 106 K(i.freeram),
103 K(i.bufferram), 107 K(i.bufferram),
@@ -144,6 +148,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
144 (unsigned long)VMALLOC_TOTAL >> 10, 148 (unsigned long)VMALLOC_TOTAL >> 10,
145 vmi.used >> 10, 149 vmi.used >> 10,
146 vmi.largest_chunk >> 10 150 vmi.largest_chunk >> 10
151#ifdef CONFIG_MEMORY_FAILURE
152 ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
153#endif
147 ); 154 );
148 155
149 hugetlb_report_meminfo(m); 156 hugetlb_report_meminfo(m);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d5e5559e31db..381854461b28 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1635,4 +1635,5 @@ const struct address_space_operations xfs_address_space_operations = {
1635 .direct_IO = xfs_vm_direct_IO, 1635 .direct_IO = xfs_vm_direct_IO,
1636 .migratepage = buffer_migrate_page, 1636 .migratepage = buffer_migrate_page,
1637 .is_partially_uptodate = block_is_partially_uptodate, 1637 .is_partially_uptodate = block_is_partially_uptodate,
1638 .error_remove_page = generic_error_remove_page,
1638}; 1639};
diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h
index dd63bd38864b..5ee13b2fd223 100644
--- a/include/asm-generic/mman-common.h
+++ b/include/asm-generic/mman-common.h
@@ -34,6 +34,7 @@
34#define MADV_REMOVE 9 /* remove these pages & resources */ 34#define MADV_REMOVE 9 /* remove these pages & resources */
35#define MADV_DONTFORK 10 /* don't inherit across fork */ 35#define MADV_DONTFORK 10 /* don't inherit across fork */
36#define MADV_DOFORK 11 /* do inherit across fork */ 36#define MADV_DOFORK 11 /* do inherit across fork */
37#define MADV_HWPOISON 100 /* poison a page for testing */
37 38
38#define MADV_MERGEABLE 12 /* KSM may merge identical pages */ 39#define MADV_MERGEABLE 12 /* KSM may merge identical pages */
39#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ 40#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index c840719a8c59..942d30b5aab1 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -82,6 +82,7 @@ typedef struct siginfo {
82#ifdef __ARCH_SI_TRAPNO 82#ifdef __ARCH_SI_TRAPNO
83 int _trapno; /* TRAP # which caused the signal */ 83 int _trapno; /* TRAP # which caused the signal */
84#endif 84#endif
85 short _addr_lsb; /* LSB of the reported address */
85 } _sigfault; 86 } _sigfault;
86 87
87 /* SIGPOLL */ 88 /* SIGPOLL */
@@ -112,6 +113,7 @@ typedef struct siginfo {
112#ifdef __ARCH_SI_TRAPNO 113#ifdef __ARCH_SI_TRAPNO
113#define si_trapno _sifields._sigfault._trapno 114#define si_trapno _sifields._sigfault._trapno
114#endif 115#endif
116#define si_addr_lsb _sifields._sigfault._addr_lsb
115#define si_band _sifields._sigpoll._band 117#define si_band _sifields._sigpoll._band
116#define si_fd _sifields._sigpoll._fd 118#define si_fd _sifields._sigpoll._fd
117 119
@@ -192,7 +194,11 @@ typedef struct siginfo {
192#define BUS_ADRALN (__SI_FAULT|1) /* invalid address alignment */ 194#define BUS_ADRALN (__SI_FAULT|1) /* invalid address alignment */
193#define BUS_ADRERR (__SI_FAULT|2) /* non-existant physical address */ 195#define BUS_ADRERR (__SI_FAULT|2) /* non-existant physical address */
194#define BUS_OBJERR (__SI_FAULT|3) /* object specific hardware error */ 196#define BUS_OBJERR (__SI_FAULT|3) /* object specific hardware error */
195#define NSIGBUS 3 197/* hardware memory error consumed on a machine check: action required */
198#define BUS_MCEERR_AR (__SI_FAULT|4)
199/* hardware memory error detected in process but not consumed: action optional*/
200#define BUS_MCEERR_AO (__SI_FAULT|5)
201#define NSIGBUS 5
196 202
197/* 203/*
198 * SIGTRAP si_codes 204 * SIGTRAP si_codes
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 33ed6644abd0..78e95b8b66d4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -595,6 +595,7 @@ struct address_space_operations {
595 int (*launder_page) (struct page *); 595 int (*launder_page) (struct page *);
596 int (*is_partially_uptodate) (struct page *, read_descriptor_t *, 596 int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
597 unsigned long); 597 unsigned long);
598 int (*error_remove_page)(struct address_space *, struct page *);
598}; 599};
599 600
600/* 601/*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 87218ae84e36..6953a5a53e44 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -695,11 +695,12 @@ static inline int page_mapped(struct page *page)
695#define VM_FAULT_SIGBUS 0x0002 695#define VM_FAULT_SIGBUS 0x0002
696#define VM_FAULT_MAJOR 0x0004 696#define VM_FAULT_MAJOR 0x0004
697#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ 697#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
698#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned page */
698 699
699#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ 700#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
700#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ 701#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
701 702
702#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS) 703#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
703 704
704/* 705/*
705 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. 706 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
@@ -794,6 +795,11 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
794extern int vmtruncate(struct inode * inode, loff_t offset); 795extern int vmtruncate(struct inode * inode, loff_t offset);
795extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end); 796extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
796 797
798int truncate_inode_page(struct address_space *mapping, struct page *page);
799int generic_error_remove_page(struct address_space *mapping, struct page *page);
800
801int invalidate_inode_page(struct page *page);
802
797#ifdef CONFIG_MMU 803#ifdef CONFIG_MMU
798extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 804extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
799 unsigned long address, unsigned int flags); 805 unsigned long address, unsigned int flags);
@@ -1308,5 +1314,12 @@ void vmemmap_populate_print_last(void);
1308extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, 1314extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
1309 size_t size); 1315 size_t size);
1310extern void refund_locked_memory(struct mm_struct *mm, size_t size); 1316extern void refund_locked_memory(struct mm_struct *mm, size_t size);
1317
1318extern void memory_failure(unsigned long pfn, int trapno);
1319extern int __memory_failure(unsigned long pfn, int trapno, int ref);
1320extern int sysctl_memory_failure_early_kill;
1321extern int sysctl_memory_failure_recovery;
1322extern atomic_long_t mce_bad_pages;
1323
1311#endif /* __KERNEL__ */ 1324#endif /* __KERNEL__ */
1312#endif /* _LINUX_MM_H */ 1325#endif /* _LINUX_MM_H */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 13de789f0a5c..6b202b173955 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -51,6 +51,9 @@
51 * PG_buddy is set to indicate that the page is free and in the buddy system 51 * PG_buddy is set to indicate that the page is free and in the buddy system
52 * (see mm/page_alloc.c). 52 * (see mm/page_alloc.c).
53 * 53 *
54 * PG_hwpoison indicates that a page got corrupted in hardware and contains
55 * data with incorrect ECC bits that triggered a machine check. Accessing is
56 * not safe since it may cause another machine check. Don't touch!
54 */ 57 */
55 58
56/* 59/*
@@ -102,6 +105,9 @@ enum pageflags {
102#ifdef CONFIG_ARCH_USES_PG_UNCACHED 105#ifdef CONFIG_ARCH_USES_PG_UNCACHED
103 PG_uncached, /* Page has been mapped as uncached */ 106 PG_uncached, /* Page has been mapped as uncached */
104#endif 107#endif
108#ifdef CONFIG_MEMORY_FAILURE
109 PG_hwpoison, /* hardware poisoned page. Don't touch */
110#endif
105 __NR_PAGEFLAGS, 111 __NR_PAGEFLAGS,
106 112
107 /* Filesystems */ 113 /* Filesystems */
@@ -269,6 +275,15 @@ PAGEFLAG(Uncached, uncached)
269PAGEFLAG_FALSE(Uncached) 275PAGEFLAG_FALSE(Uncached)
270#endif 276#endif
271 277
278#ifdef CONFIG_MEMORY_FAILURE
279PAGEFLAG(HWPoison, hwpoison)
280TESTSETFLAG(HWPoison, hwpoison)
281#define __PG_HWPOISON (1UL << PG_hwpoison)
282#else
283PAGEFLAG_FALSE(HWPoison)
284#define __PG_HWPOISON 0
285#endif
286
272static inline int PageUptodate(struct page *page) 287static inline int PageUptodate(struct page *page)
273{ 288{
274 int ret = test_bit(PG_uptodate, &(page)->flags); 289 int ret = test_bit(PG_uptodate, &(page)->flags);
@@ -393,7 +408,7 @@ static inline void __ClearPageTail(struct page *page)
393 1 << PG_private | 1 << PG_private_2 | \ 408 1 << PG_private | 1 << PG_private_2 | \
394 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 409 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \
395 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ 410 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \
396 1 << PG_unevictable | __PG_MLOCKED) 411 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON)
397 412
398/* 413/*
399 * Flags checked when a page is prepped for return by the page allocator. 414 * Flags checked when a page is prepped for return by the page allocator.
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 07bff666e65b..931150566ade 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -88,4 +88,6 @@
88#define PR_TASK_PERF_EVENTS_DISABLE 31 88#define PR_TASK_PERF_EVENTS_DISABLE 31
89#define PR_TASK_PERF_EVENTS_ENABLE 32 89#define PR_TASK_PERF_EVENTS_ENABLE 32
90 90
91#define PR_MCE_KILL 33
92
91#endif /* _LINUX_PRCTL_H */ 93#endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 477841d29fce..cb0ba7032609 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -81,7 +81,19 @@ static inline void page_dup_rmap(struct page *page)
81 */ 81 */
82int page_referenced(struct page *, int is_locked, 82int page_referenced(struct page *, int is_locked,
83 struct mem_cgroup *cnt, unsigned long *vm_flags); 83 struct mem_cgroup *cnt, unsigned long *vm_flags);
84int try_to_unmap(struct page *, int ignore_refs); 84enum ttu_flags {
85 TTU_UNMAP = 0, /* unmap mode */
86 TTU_MIGRATION = 1, /* migration mode */
87 TTU_MUNLOCK = 2, /* munlock mode */
88 TTU_ACTION_MASK = 0xff,
89
90 TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
91 TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
92 TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
93};
94#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
95
96int try_to_unmap(struct page *, enum ttu_flags flags);
85 97
86/* 98/*
87 * Called from mm/filemap_xip.c to unmap empty zero page 99 * Called from mm/filemap_xip.c to unmap empty zero page
@@ -108,6 +120,13 @@ int page_mkclean(struct page *);
108 */ 120 */
109int try_to_munlock(struct page *); 121int try_to_munlock(struct page *);
110 122
123/*
124 * Called by memory-failure.c to kill processes.
125 */
126struct anon_vma *page_lock_anon_vma(struct page *page);
127void page_unlock_anon_vma(struct anon_vma *anon_vma);
128int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
129
111#else /* !CONFIG_MMU */ 130#else /* !CONFIG_MMU */
112 131
113#define anon_vma_init() do {} while (0) 132#define anon_vma_init() do {} while (0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8a16f6d11dcd..75e6e60bf583 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1734,6 +1734,7 @@ extern cputime_t task_gtime(struct task_struct *p);
1734#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ 1734#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
1735#define PF_VCPU 0x00000010 /* I'm a virtual CPU */ 1735#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
1736#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ 1736#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
1737#define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */
1737#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ 1738#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
1738#define PF_DUMPCORE 0x00000200 /* dumped core */ 1739#define PF_DUMPCORE 0x00000200 /* dumped core */
1739#define PF_SIGNALED 0x00000400 /* killed by a signal */ 1740#define PF_SIGNALED 0x00000400 /* killed by a signal */
@@ -1753,6 +1754,7 @@ extern cputime_t task_gtime(struct task_struct *p);
1753#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ 1754#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
1754#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ 1755#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
1755#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ 1756#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */
1757#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
1756#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ 1758#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
1757#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ 1759#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
1758#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ 1760#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 82232dbea3f7..4ec90019c1a4 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -34,16 +34,38 @@ static inline int current_is_kswapd(void)
34 * the type/offset into the pte as 5/27 as well. 34 * the type/offset into the pte as 5/27 as well.
35 */ 35 */
36#define MAX_SWAPFILES_SHIFT 5 36#define MAX_SWAPFILES_SHIFT 5
37#ifndef CONFIG_MIGRATION 37
38#define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT) 38/*
39 * Use some of the swap files numbers for other purposes. This
40 * is a convenient way to hook into the VM to trigger special
41 * actions on faults.
42 */
43
44/*
45 * NUMA node memory migration support
46 */
47#ifdef CONFIG_MIGRATION
48#define SWP_MIGRATION_NUM 2
49#define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM)
50#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
39#else 51#else
40/* Use last two entries for page migration swap entries */ 52#define SWP_MIGRATION_NUM 0
41#define MAX_SWAPFILES ((1 << MAX_SWAPFILES_SHIFT)-2)
42#define SWP_MIGRATION_READ MAX_SWAPFILES
43#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + 1)
44#endif 53#endif
45 54
46/* 55/*
56 * Handling of hardware poisoned pages with memory corruption.
57 */
58#ifdef CONFIG_MEMORY_FAILURE
59#define SWP_HWPOISON_NUM 1
60#define SWP_HWPOISON MAX_SWAPFILES
61#else
62#define SWP_HWPOISON_NUM 0
63#endif
64
65#define MAX_SWAPFILES \
66 ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
67
68/*
47 * Magic header for a swap area. The first part of the union is 69 * Magic header for a swap area. The first part of the union is
48 * what the swap magic looks like for the old (limited to 128MB) 70 * what the swap magic looks like for the old (limited to 128MB)
49 * swap area format, the second part of the union adds - in the 71 * swap area format, the second part of the union adds - in the
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 6ec39ab27b4b..cd42e30b7c6e 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -131,3 +131,41 @@ static inline int is_write_migration_entry(swp_entry_t entry)
131 131
132#endif 132#endif
133 133
134#ifdef CONFIG_MEMORY_FAILURE
135/*
136 * Support for hardware poisoned pages
137 */
138static inline swp_entry_t make_hwpoison_entry(struct page *page)
139{
140 BUG_ON(!PageLocked(page));
141 return swp_entry(SWP_HWPOISON, page_to_pfn(page));
142}
143
144static inline int is_hwpoison_entry(swp_entry_t entry)
145{
146 return swp_type(entry) == SWP_HWPOISON;
147}
148#else
149
150static inline swp_entry_t make_hwpoison_entry(struct page *page)
151{
152 return swp_entry(0, 0);
153}
154
155static inline int is_hwpoison_entry(swp_entry_t swp)
156{
157 return 0;
158}
159#endif
160
161#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
162static inline int non_swap_entry(swp_entry_t entry)
163{
164 return swp_type(entry) >= MAX_SWAPFILES;
165}
166#else
167static inline int non_swap_entry(swp_entry_t entry)
168{
169 return 0;
170}
171#endif
diff --git a/kernel/sys.c b/kernel/sys.c
index ebcb15611728..255475d163e0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1542,6 +1542,28 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1542 current->timer_slack_ns = arg2; 1542 current->timer_slack_ns = arg2;
1543 error = 0; 1543 error = 0;
1544 break; 1544 break;
1545 case PR_MCE_KILL:
1546 if (arg4 | arg5)
1547 return -EINVAL;
1548 switch (arg2) {
1549 case 0:
1550 if (arg3 != 0)
1551 return -EINVAL;
1552 current->flags &= ~PF_MCE_PROCESS;
1553 break;
1554 case 1:
1555 current->flags |= PF_MCE_PROCESS;
1556 if (arg3 != 0)
1557 current->flags |= PF_MCE_EARLY;
1558 else
1559 current->flags &= ~PF_MCE_EARLY;
1560 break;
1561 default:
1562 return -EINVAL;
1563 }
1564 error = 0;
1565 break;
1566
1545 default: 1567 default:
1546 error = -EINVAL; 1568 error = -EINVAL;
1547 break; 1569 break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a02697b7cb97..0d949c517412 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1398,6 +1398,31 @@ static struct ctl_table vm_table[] = {
1398 .mode = 0644, 1398 .mode = 0644,
1399 .proc_handler = &scan_unevictable_handler, 1399 .proc_handler = &scan_unevictable_handler,
1400 }, 1400 },
1401#ifdef CONFIG_MEMORY_FAILURE
1402 {
1403 .ctl_name = CTL_UNNUMBERED,
1404 .procname = "memory_failure_early_kill",
1405 .data = &sysctl_memory_failure_early_kill,
1406 .maxlen = sizeof(sysctl_memory_failure_early_kill),
1407 .mode = 0644,
1408 .proc_handler = &proc_dointvec_minmax,
1409 .strategy = &sysctl_intvec,
1410 .extra1 = &zero,
1411 .extra2 = &one,
1412 },
1413 {
1414 .ctl_name = CTL_UNNUMBERED,
1415 .procname = "memory_failure_recovery",
1416 .data = &sysctl_memory_failure_recovery,
1417 .maxlen = sizeof(sysctl_memory_failure_recovery),
1418 .mode = 0644,
1419 .proc_handler = &proc_dointvec_minmax,
1420 .strategy = &sysctl_intvec,
1421 .extra1 = &zero,
1422 .extra2 = &one,
1423 },
1424#endif
1425
1401/* 1426/*
1402 * NOTE: do not add new entries to this table unless you have read 1427 * NOTE: do not add new entries to this table unless you have read
1403 * Documentation/sysctl/ctl_unnumbered.txt 1428 * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/mm/Kconfig b/mm/Kconfig
index 71eb0b4cce8d..247760729593 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -245,6 +245,20 @@ config DEFAULT_MMAP_MIN_ADDR
245 /proc/sys/vm/mmap_min_addr tunable. 245 /proc/sys/vm/mmap_min_addr tunable.
246 246
247 247
248config MEMORY_FAILURE
249 depends on MMU
250 depends on X86_MCE
251 bool "Enable recovery from hardware memory errors"
252 help
253 Enables code to recover from some memory failures on systems
254 with MCA recovery. This allows a system to continue running
255 even when some of its memory has uncorrected errors. This requires
256 special hardware support and typically ECC memory.
257
258config HWPOISON_INJECT
259 tristate "Poison pages injector"
260 depends on MEMORY_FAILURE && DEBUG_KERNEL
261
248config NOMMU_INITIAL_TRIM_EXCESS 262config NOMMU_INITIAL_TRIM_EXCESS
249 int "Turn on mmap() excess space trimming before booting" 263 int "Turn on mmap() excess space trimming before booting"
250 depends on !MMU 264 depends on !MMU
diff --git a/mm/Makefile b/mm/Makefile
index 88193d73cd1a..515fd793c17f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -41,5 +41,7 @@ obj-$(CONFIG_SMP) += allocpercpu.o
41endif 41endif
42obj-$(CONFIG_QUICKLIST) += quicklist.o 42obj-$(CONFIG_QUICKLIST) += quicklist.o
43obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 43obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
44obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
45obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
44obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 46obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
45obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 47obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/filemap.c b/mm/filemap.c
index bcc7372aebbc..c1fc205a92c6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -104,6 +104,10 @@
104 * 104 *
105 * ->task->proc_lock 105 * ->task->proc_lock
106 * ->dcache_lock (proc_pid_lookup) 106 * ->dcache_lock (proc_pid_lookup)
107 *
108 * (code doesn't rely on that order, so you could switch it around)
109 * ->tasklist_lock (memory_failure, collect_procs_ao)
110 * ->i_mmap_lock
107 */ 111 */
108 112
109/* 113/*
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
new file mode 100644
index 000000000000..e1d85137f086
--- /dev/null
+++ b/mm/hwpoison-inject.c
@@ -0,0 +1,41 @@
1/* Inject a hwpoison memory failure on a arbitary pfn */
2#include <linux/module.h>
3#include <linux/debugfs.h>
4#include <linux/kernel.h>
5#include <linux/mm.h>
6
7static struct dentry *hwpoison_dir, *corrupt_pfn;
8
9static int hwpoison_inject(void *data, u64 val)
10{
11 if (!capable(CAP_SYS_ADMIN))
12 return -EPERM;
13 printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val);
14 return __memory_failure(val, 18, 0);
15}
16
17DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
18
19static void pfn_inject_exit(void)
20{
21 if (hwpoison_dir)
22 debugfs_remove_recursive(hwpoison_dir);
23}
24
25static int pfn_inject_init(void)
26{
27 hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
28 if (hwpoison_dir == NULL)
29 return -ENOMEM;
30 corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
31 NULL, &hwpoison_fops);
32 if (corrupt_pfn == NULL) {
33 pfn_inject_exit();
34 return -ENOMEM;
35 }
36 return 0;
37}
38
39module_init(pfn_inject_init);
40module_exit(pfn_inject_exit);
41MODULE_LICENSE("GPL");
diff --git a/mm/madvise.c b/mm/madvise.c
index d9ae2067952e..35b1479b7c9d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,6 +218,32 @@ static long madvise_remove(struct vm_area_struct *vma,
218 return error; 218 return error;
219} 219}
220 220
221#ifdef CONFIG_MEMORY_FAILURE
222/*
223 * Error injection support for memory error handling.
224 */
225static int madvise_hwpoison(unsigned long start, unsigned long end)
226{
227 int ret = 0;
228
229 if (!capable(CAP_SYS_ADMIN))
230 return -EPERM;
231 for (; start < end; start += PAGE_SIZE) {
232 struct page *p;
233 int ret = get_user_pages(current, current->mm, start, 1,
234 0, 0, &p, NULL);
235 if (ret != 1)
236 return ret;
237 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
238 page_to_pfn(p), start);
239 /* Ignore return value for now */
240 __memory_failure(page_to_pfn(p), 0, 1);
241 put_page(p);
242 }
243 return ret;
244}
245#endif
246
221static long 247static long
222madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 248madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
223 unsigned long start, unsigned long end, int behavior) 249 unsigned long start, unsigned long end, int behavior)
@@ -308,6 +334,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
308 int write; 334 int write;
309 size_t len; 335 size_t len;
310 336
337#ifdef CONFIG_MEMORY_FAILURE
338 if (behavior == MADV_HWPOISON)
339 return madvise_hwpoison(start, start+len_in);
340#endif
311 if (!madvise_behavior_valid(behavior)) 341 if (!madvise_behavior_valid(behavior))
312 return error; 342 return error;
313 343
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
new file mode 100644
index 000000000000..729d4b15b645
--- /dev/null
+++ b/mm/memory-failure.c
@@ -0,0 +1,832 @@
1/*
2 * Copyright (C) 2008, 2009 Intel Corporation
3 * Authors: Andi Kleen, Fengguang Wu
4 *
5 * This software may be redistributed and/or modified under the terms of
6 * the GNU General Public License ("GPL") version 2 only as published by the
7 * Free Software Foundation.
8 *
9 * High level machine check handler. Handles pages reported by the
10 * hardware as being corrupted usually due to a 2bit ECC memory or cache
11 * failure.
12 *
13 * Handles page cache pages in various states. The tricky part
14 * here is that we can access any page asynchronous to other VM
15 * users, because memory failures could happen anytime and anywhere,
16 * possibly violating some of their assumptions. This is why this code
17 * has to be extremely careful. Generally it tries to use normal locking
18 * rules, as in get the standard locks, even if that means the
19 * error handling takes potentially a long time.
20 *
21 * The operation to map back from RMAP chains to processes has to walk
22 * the complete process list and has non linear complexity with the number
23 * mappings. In short it can be quite slow. But since memory corruptions
24 * are rare we hope to get away with this.
25 */
26
27/*
28 * Notebook:
29 * - hugetlb needs more code
30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
31 * - pass bad pages to kdump next kernel
32 */
33#define DEBUG 1 /* remove me in 2.6.34 */
34#include <linux/kernel.h>
35#include <linux/mm.h>
36#include <linux/page-flags.h>
37#include <linux/sched.h>
38#include <linux/rmap.h>
39#include <linux/pagemap.h>
40#include <linux/swap.h>
41#include <linux/backing-dev.h>
42#include "internal.h"
43
44int sysctl_memory_failure_early_kill __read_mostly = 0;
45
46int sysctl_memory_failure_recovery __read_mostly = 1;
47
48atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
49
50/*
51 * Send all the processes who have the page mapped an ``action optional''
52 * signal.
53 */
54static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
55 unsigned long pfn)
56{
57 struct siginfo si;
58 int ret;
59
60 printk(KERN_ERR
61 "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
62 pfn, t->comm, t->pid);
63 si.si_signo = SIGBUS;
64 si.si_errno = 0;
65 si.si_code = BUS_MCEERR_AO;
66 si.si_addr = (void *)addr;
67#ifdef __ARCH_SI_TRAPNO
68 si.si_trapno = trapno;
69#endif
70 si.si_addr_lsb = PAGE_SHIFT;
71 /*
72 * Don't use force here, it's convenient if the signal
73 * can be temporarily blocked.
74 * This could cause a loop when the user sets SIGBUS
75 * to SIG_IGN, but hopefully noone will do that?
76 */
77 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
78 if (ret < 0)
79 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
80 t->comm, t->pid, ret);
81 return ret;
82}
83
84/*
85 * Kill all processes that have a poisoned page mapped and then isolate
86 * the page.
87 *
88 * General strategy:
89 * Find all processes having the page mapped and kill them.
90 * But we keep a page reference around so that the page is not
91 * actually freed yet.
92 * Then stash the page away
93 *
94 * There's no convenient way to get back to mapped processes
95 * from the VMAs. So do a brute-force search over all
96 * running processes.
97 *
98 * Remember that machine checks are not common (or rather
99 * if they are common you have other problems), so this shouldn't
100 * be a performance issue.
101 *
102 * Also there are some races possible while we get from the
103 * error detection to actually handle it.
104 */
105
106struct to_kill {
107 struct list_head nd;
108 struct task_struct *tsk;
109 unsigned long addr;
110 unsigned addr_valid:1;
111};
112
113/*
114 * Failure handling: if we can't find or can't kill a process there's
115 * not much we can do. We just print a message and ignore otherwise.
116 */
117
118/*
119 * Schedule a process for later kill.
120 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
121 * TBD would GFP_NOIO be enough?
122 */
123static void add_to_kill(struct task_struct *tsk, struct page *p,
124 struct vm_area_struct *vma,
125 struct list_head *to_kill,
126 struct to_kill **tkc)
127{
128 struct to_kill *tk;
129
130 if (*tkc) {
131 tk = *tkc;
132 *tkc = NULL;
133 } else {
134 tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
135 if (!tk) {
136 printk(KERN_ERR
137 "MCE: Out of memory while machine check handling\n");
138 return;
139 }
140 }
141 tk->addr = page_address_in_vma(p, vma);
142 tk->addr_valid = 1;
143
144 /*
145 * In theory we don't have to kill when the page was
146 * munmaped. But it could be also a mremap. Since that's
147 * likely very rare kill anyways just out of paranoia, but use
148 * a SIGKILL because the error is not contained anymore.
149 */
150 if (tk->addr == -EFAULT) {
151 pr_debug("MCE: Unable to find user space address %lx in %s\n",
152 page_to_pfn(p), tsk->comm);
153 tk->addr_valid = 0;
154 }
155 get_task_struct(tsk);
156 tk->tsk = tsk;
157 list_add_tail(&tk->nd, to_kill);
158}
159
160/*
161 * Kill the processes that have been collected earlier.
162 *
163 * Only do anything when DOIT is set, otherwise just free the list
164 * (this is used for clean pages which do not need killing)
165 * Also when FAIL is set do a force kill because something went
166 * wrong earlier.
167 */
168static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
169 int fail, unsigned long pfn)
170{
171 struct to_kill *tk, *next;
172
173 list_for_each_entry_safe (tk, next, to_kill, nd) {
174 if (doit) {
175 /*
176 * In case something went wrong with munmaping
177 * make sure the process doesn't catch the
178 * signal and then access the memory. Just kill it.
179 * the signal handlers
180 */
181 if (fail || tk->addr_valid == 0) {
182 printk(KERN_ERR
183 "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
184 pfn, tk->tsk->comm, tk->tsk->pid);
185 force_sig(SIGKILL, tk->tsk);
186 }
187
188 /*
189 * In theory the process could have mapped
190 * something else on the address in-between. We could
191 * check for that, but we need to tell the
192 * process anyways.
193 */
194 else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
195 pfn) < 0)
196 printk(KERN_ERR
197 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
198 pfn, tk->tsk->comm, tk->tsk->pid);
199 }
200 put_task_struct(tk->tsk);
201 kfree(tk);
202 }
203}
204
205static int task_early_kill(struct task_struct *tsk)
206{
207 if (!tsk->mm)
208 return 0;
209 if (tsk->flags & PF_MCE_PROCESS)
210 return !!(tsk->flags & PF_MCE_EARLY);
211 return sysctl_memory_failure_early_kill;
212}
213
214/*
215 * Collect processes when the error hit an anonymous page.
216 */
217static void collect_procs_anon(struct page *page, struct list_head *to_kill,
218 struct to_kill **tkc)
219{
220 struct vm_area_struct *vma;
221 struct task_struct *tsk;
222 struct anon_vma *av;
223
224 read_lock(&tasklist_lock);
225 av = page_lock_anon_vma(page);
226 if (av == NULL) /* Not actually mapped anymore */
227 goto out;
228 for_each_process (tsk) {
229 if (!task_early_kill(tsk))
230 continue;
231 list_for_each_entry (vma, &av->head, anon_vma_node) {
232 if (!page_mapped_in_vma(page, vma))
233 continue;
234 if (vma->vm_mm == tsk->mm)
235 add_to_kill(tsk, page, vma, to_kill, tkc);
236 }
237 }
238 page_unlock_anon_vma(av);
239out:
240 read_unlock(&tasklist_lock);
241}
242
243/*
244 * Collect processes when the error hit a file mapped page.
245 */
246static void collect_procs_file(struct page *page, struct list_head *to_kill,
247 struct to_kill **tkc)
248{
249 struct vm_area_struct *vma;
250 struct task_struct *tsk;
251 struct prio_tree_iter iter;
252 struct address_space *mapping = page->mapping;
253
254 /*
255 * A note on the locking order between the two locks.
256 * We don't rely on this particular order.
257 * If you have some other code that needs a different order
258 * feel free to switch them around. Or add a reverse link
259 * from mm_struct to task_struct, then this could be all
260 * done without taking tasklist_lock and looping over all tasks.
261 */
262
263 read_lock(&tasklist_lock);
264 spin_lock(&mapping->i_mmap_lock);
265 for_each_process(tsk) {
266 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
267
268 if (!task_early_kill(tsk))
269 continue;
270
271 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
272 pgoff) {
273 /*
274 * Send early kill signal to tasks where a vma covers
275 * the page but the corrupted page is not necessarily
276 * mapped it in its pte.
277 * Assume applications who requested early kill want
278 * to be informed of all such data corruptions.
279 */
280 if (vma->vm_mm == tsk->mm)
281 add_to_kill(tsk, page, vma, to_kill, tkc);
282 }
283 }
284 spin_unlock(&mapping->i_mmap_lock);
285 read_unlock(&tasklist_lock);
286}
287
288/*
289 * Collect the processes who have the corrupted page mapped to kill.
290 * This is done in two steps for locking reasons.
291 * First preallocate one tokill structure outside the spin locks,
292 * so that we can kill at least one process reasonably reliable.
293 */
294static void collect_procs(struct page *page, struct list_head *tokill)
295{
296 struct to_kill *tk;
297
298 if (!page->mapping)
299 return;
300
301 tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
302 if (!tk)
303 return;
304 if (PageAnon(page))
305 collect_procs_anon(page, tokill, &tk);
306 else
307 collect_procs_file(page, tokill, &tk);
308 kfree(tk);
309}
310
311/*
312 * Error handlers for various types of pages.
313 */
314
315enum outcome {
316 FAILED, /* Error handling failed */
317 DELAYED, /* Will be handled later */
318 IGNORED, /* Error safely ignored */
319 RECOVERED, /* Successfully recovered */
320};
321
322static const char *action_name[] = {
323 [FAILED] = "Failed",
324 [DELAYED] = "Delayed",
325 [IGNORED] = "Ignored",
326 [RECOVERED] = "Recovered",
327};
328
329/*
330 * Error hit kernel page.
331 * Do nothing, try to be lucky and not touch this instead. For a few cases we
332 * could be more sophisticated.
333 */
334static int me_kernel(struct page *p, unsigned long pfn)
335{
336 return DELAYED;
337}
338
339/*
340 * Already poisoned page.
341 */
342static int me_ignore(struct page *p, unsigned long pfn)
343{
344 return IGNORED;
345}
346
347/*
348 * Page in unknown state. Do nothing.
349 */
350static int me_unknown(struct page *p, unsigned long pfn)
351{
352 printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
353 return FAILED;
354}
355
356/*
357 * Free memory
358 */
359static int me_free(struct page *p, unsigned long pfn)
360{
361 return DELAYED;
362}
363
364/*
365 * Clean (or cleaned) page cache page.
366 */
367static int me_pagecache_clean(struct page *p, unsigned long pfn)
368{
369 int err;
370 int ret = FAILED;
371 struct address_space *mapping;
372
373 if (!isolate_lru_page(p))
374 page_cache_release(p);
375
376 /*
377 * For anonymous pages we're done the only reference left
378 * should be the one m_f() holds.
379 */
380 if (PageAnon(p))
381 return RECOVERED;
382
383 /*
384 * Now truncate the page in the page cache. This is really
385 * more like a "temporary hole punch"
386 * Don't do this for block devices when someone else
387 * has a reference, because it could be file system metadata
388 * and that's not safe to truncate.
389 */
390 mapping = page_mapping(p);
391 if (!mapping) {
392 /*
393 * Page has been teared down in the meanwhile
394 */
395 return FAILED;
396 }
397
398 /*
399 * Truncation is a bit tricky. Enable it per file system for now.
400 *
401 * Open: to take i_mutex or not for this? Right now we don't.
402 */
403 if (mapping->a_ops->error_remove_page) {
404 err = mapping->a_ops->error_remove_page(mapping, p);
405 if (err != 0) {
406 printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
407 pfn, err);
408 } else if (page_has_private(p) &&
409 !try_to_release_page(p, GFP_NOIO)) {
410 pr_debug("MCE %#lx: failed to release buffers\n", pfn);
411 } else {
412 ret = RECOVERED;
413 }
414 } else {
415 /*
416 * If the file system doesn't support it just invalidate
417 * This fails on dirty or anything with private pages
418 */
419 if (invalidate_inode_page(p))
420 ret = RECOVERED;
421 else
422 printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
423 pfn);
424 }
425 return ret;
426}
427
428/*
429 * Dirty cache page page
430 * Issues: when the error hit a hole page the error is not properly
431 * propagated.
432 */
433static int me_pagecache_dirty(struct page *p, unsigned long pfn)
434{
435 struct address_space *mapping = page_mapping(p);
436
437 SetPageError(p);
438 /* TBD: print more information about the file. */
439 if (mapping) {
440 /*
441 * IO error will be reported by write(), fsync(), etc.
442 * who check the mapping.
443 * This way the application knows that something went
444 * wrong with its dirty file data.
445 *
446 * There's one open issue:
447 *
448 * The EIO will be only reported on the next IO
449 * operation and then cleared through the IO map.
450 * Normally Linux has two mechanisms to pass IO error
451 * first through the AS_EIO flag in the address space
452 * and then through the PageError flag in the page.
453 * Since we drop pages on memory failure handling the
454 * only mechanism open to use is through AS_AIO.
455 *
456 * This has the disadvantage that it gets cleared on
457 * the first operation that returns an error, while
458 * the PageError bit is more sticky and only cleared
459 * when the page is reread or dropped. If an
460 * application assumes it will always get error on
461 * fsync, but does other operations on the fd before
462 * and the page is dropped inbetween then the error
463 * will not be properly reported.
464 *
465 * This can already happen even without hwpoisoned
466 * pages: first on metadata IO errors (which only
467 * report through AS_EIO) or when the page is dropped
468 * at the wrong time.
469 *
470 * So right now we assume that the application DTRT on
471 * the first EIO, but we're not worse than other parts
472 * of the kernel.
473 */
474 mapping_set_error(mapping, EIO);
475 }
476
477 return me_pagecache_clean(p, pfn);
478}
479
480/*
481 * Clean and dirty swap cache.
482 *
483 * Dirty swap cache page is tricky to handle. The page could live both in page
484 * cache and swap cache(ie. page is freshly swapped in). So it could be
485 * referenced concurrently by 2 types of PTEs:
486 * normal PTEs and swap PTEs. We try to handle them consistently by calling
487 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
488 * and then
489 * - clear dirty bit to prevent IO
490 * - remove from LRU
491 * - but keep in the swap cache, so that when we return to it on
492 * a later page fault, we know the application is accessing
493 * corrupted data and shall be killed (we installed simple
494 * interception code in do_swap_page to catch it).
495 *
496 * Clean swap cache pages can be directly isolated. A later page fault will
497 * bring in the known good data from disk.
498 */
499static int me_swapcache_dirty(struct page *p, unsigned long pfn)
500{
501 int ret = FAILED;
502
503 ClearPageDirty(p);
504 /* Trigger EIO in shmem: */
505 ClearPageUptodate(p);
506
507 if (!isolate_lru_page(p)) {
508 page_cache_release(p);
509 ret = DELAYED;
510 }
511
512 return ret;
513}
514
515static int me_swapcache_clean(struct page *p, unsigned long pfn)
516{
517 int ret = FAILED;
518
519 if (!isolate_lru_page(p)) {
520 page_cache_release(p);
521 ret = RECOVERED;
522 }
523 delete_from_swap_cache(p);
524 return ret;
525}
526
527/*
528 * Huge pages. Needs work.
529 * Issues:
530 * No rmap support so we cannot find the original mapper. In theory could walk
531 * all MMs and look for the mappings, but that would be non atomic and racy.
532 * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
533 * like just walking the current process and hoping it has it mapped (that
534 * should be usually true for the common "shared database cache" case)
535 * Should handle free huge pages and dequeue them too, but this needs to
536 * handle huge page accounting correctly.
537 */
538static int me_huge_page(struct page *p, unsigned long pfn)
539{
540 return FAILED;
541}
542
543/*
544 * Various page states we can handle.
545 *
546 * A page state is defined by its current page->flags bits.
547 * The table matches them in order and calls the right handler.
548 *
549 * This is quite tricky because we can access page at any time
550 * in its live cycle, so all accesses have to be extremly careful.
551 *
552 * This is not complete. More states could be added.
553 * For any missing state don't attempt recovery.
554 */
555
556#define dirty (1UL << PG_dirty)
557#define sc (1UL << PG_swapcache)
558#define unevict (1UL << PG_unevictable)
559#define mlock (1UL << PG_mlocked)
560#define writeback (1UL << PG_writeback)
561#define lru (1UL << PG_lru)
562#define swapbacked (1UL << PG_swapbacked)
563#define head (1UL << PG_head)
564#define tail (1UL << PG_tail)
565#define compound (1UL << PG_compound)
566#define slab (1UL << PG_slab)
567#define buddy (1UL << PG_buddy)
568#define reserved (1UL << PG_reserved)
569
570static struct page_state {
571 unsigned long mask;
572 unsigned long res;
573 char *msg;
574 int (*action)(struct page *p, unsigned long pfn);
575} error_states[] = {
576 { reserved, reserved, "reserved kernel", me_ignore },
577 { buddy, buddy, "free kernel", me_free },
578
579 /*
580 * Could in theory check if slab page is free or if we can drop
581 * currently unused objects without touching them. But just
582 * treat it as standard kernel for now.
583 */
584 { slab, slab, "kernel slab", me_kernel },
585
586#ifdef CONFIG_PAGEFLAGS_EXTENDED
587 { head, head, "huge", me_huge_page },
588 { tail, tail, "huge", me_huge_page },
589#else
590 { compound, compound, "huge", me_huge_page },
591#endif
592
593 { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
594 { sc|dirty, sc, "swapcache", me_swapcache_clean },
595
596 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
597 { unevict, unevict, "unevictable LRU", me_pagecache_clean},
598
599#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
600 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
601 { mlock, mlock, "mlocked LRU", me_pagecache_clean },
602#endif
603
604 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
605 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
606 { swapbacked, swapbacked, "anonymous", me_pagecache_clean },
607
608 /*
609 * Catchall entry: must be at end.
610 */
611 { 0, 0, "unknown page state", me_unknown },
612};
613
614#undef lru
615
616static void action_result(unsigned long pfn, char *msg, int result)
617{
618 struct page *page = NULL;
619 if (pfn_valid(pfn))
620 page = pfn_to_page(pfn);
621
622 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
623 pfn,
624 page && PageDirty(page) ? "dirty " : "",
625 msg, action_name[result]);
626}
627
628static int page_action(struct page_state *ps, struct page *p,
629 unsigned long pfn, int ref)
630{
631 int result;
632
633 result = ps->action(p, pfn);
634 action_result(pfn, ps->msg, result);
635 if (page_count(p) != 1 + ref)
636 printk(KERN_ERR
637 "MCE %#lx: %s page still referenced by %d users\n",
638 pfn, ps->msg, page_count(p) - 1);
639
640 /* Could do more checks here if page looks ok */
641 /*
642 * Could adjust zone counters here to correct for the missing page.
643 */
644
645 return result == RECOVERED ? 0 : -EBUSY;
646}
647
648#define N_UNMAP_TRIES 5
649
650/*
651 * Do all that is necessary to remove user space mappings. Unmap
652 * the pages and send SIGBUS to the processes if the data was dirty.
653 */
654static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
655 int trapno)
656{
657 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
658 struct address_space *mapping;
659 LIST_HEAD(tokill);
660 int ret;
661 int i;
662 int kill = 1;
663
664 if (PageReserved(p) || PageCompound(p) || PageSlab(p))
665 return;
666
667 if (!PageLRU(p))
668 lru_add_drain_all();
669
670 /*
671 * This check implies we don't kill processes if their pages
672 * are in the swap cache early. Those are always late kills.
673 */
674 if (!page_mapped(p))
675 return;
676
677 if (PageSwapCache(p)) {
678 printk(KERN_ERR
679 "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
680 ttu |= TTU_IGNORE_HWPOISON;
681 }
682
683 /*
684 * Propagate the dirty bit from PTEs to struct page first, because we
685 * need this to decide if we should kill or just drop the page.
686 */
687 mapping = page_mapping(p);
688 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
689 if (page_mkclean(p)) {
690 SetPageDirty(p);
691 } else {
692 kill = 0;
693 ttu |= TTU_IGNORE_HWPOISON;
694 printk(KERN_INFO
695 "MCE %#lx: corrupted page was clean: dropped without side effects\n",
696 pfn);
697 }
698 }
699
700 /*
701 * First collect all the processes that have the page
702 * mapped in dirty form. This has to be done before try_to_unmap,
703 * because ttu takes the rmap data structures down.
704 *
705 * Error handling: We ignore errors here because
706 * there's nothing that can be done.
707 */
708 if (kill)
709 collect_procs(p, &tokill);
710
711 /*
712 * try_to_unmap can fail temporarily due to races.
713 * Try a few times (RED-PEN better strategy?)
714 */
715 for (i = 0; i < N_UNMAP_TRIES; i++) {
716 ret = try_to_unmap(p, ttu);
717 if (ret == SWAP_SUCCESS)
718 break;
719 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
720 }
721
722 if (ret != SWAP_SUCCESS)
723 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
724 pfn, page_mapcount(p));
725
726 /*
727 * Now that the dirty bit has been propagated to the
728 * struct page and all unmaps done we can decide if
729 * killing is needed or not. Only kill when the page
730 * was dirty, otherwise the tokill list is merely
731 * freed. When there was a problem unmapping earlier
732 * use a more force-full uncatchable kill to prevent
733 * any accesses to the poisoned memory.
734 */
735 kill_procs_ao(&tokill, !!PageDirty(p), trapno,
736 ret != SWAP_SUCCESS, pfn);
737}
738
739int __memory_failure(unsigned long pfn, int trapno, int ref)
740{
741 struct page_state *ps;
742 struct page *p;
743 int res;
744
745 if (!sysctl_memory_failure_recovery)
746 panic("Memory failure from trap %d on page %lx", trapno, pfn);
747
748 if (!pfn_valid(pfn)) {
749 action_result(pfn, "memory outside kernel control", IGNORED);
750 return -EIO;
751 }
752
753 p = pfn_to_page(pfn);
754 if (TestSetPageHWPoison(p)) {
755 action_result(pfn, "already hardware poisoned", IGNORED);
756 return 0;
757 }
758
759 atomic_long_add(1, &mce_bad_pages);
760
761 /*
762 * We need/can do nothing about count=0 pages.
763 * 1) it's a free page, and therefore in safe hand:
764 * prep_new_page() will be the gate keeper.
765 * 2) it's part of a non-compound high order page.
766 * Implies some kernel user: cannot stop them from
767 * R/W the page; let's pray that the page has been
768 * used and will be freed some time later.
769 * In fact it's dangerous to directly bump up page count from 0,
770 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
771 */
772 if (!get_page_unless_zero(compound_head(p))) {
773 action_result(pfn, "free or high order kernel", IGNORED);
774 return PageBuddy(compound_head(p)) ? 0 : -EBUSY;
775 }
776
777 /*
778 * Lock the page and wait for writeback to finish.
779 * It's very difficult to mess with pages currently under IO
780 * and in many cases impossible, so we just avoid it here.
781 */
782 lock_page_nosync(p);
783 wait_on_page_writeback(p);
784
785 /*
786 * Now take care of user space mappings.
787 */
788 hwpoison_user_mappings(p, pfn, trapno);
789
790 /*
791 * Torn down by someone else?
792 */
793 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
794 action_result(pfn, "already truncated LRU", IGNORED);
795 res = 0;
796 goto out;
797 }
798
799 res = -EBUSY;
800 for (ps = error_states;; ps++) {
801 if ((p->flags & ps->mask) == ps->res) {
802 res = page_action(ps, p, pfn, ref);
803 break;
804 }
805 }
806out:
807 unlock_page(p);
808 return res;
809}
810EXPORT_SYMBOL_GPL(__memory_failure);
811
812/**
813 * memory_failure - Handle memory failure of a page.
814 * @pfn: Page Number of the corrupted page
815 * @trapno: Trap number reported in the signal to user space.
816 *
817 * This function is called by the low level machine check code
818 * of an architecture when it detects hardware memory corruption
819 * of a page. It tries its best to recover, which includes
820 * dropping pages, killing processes etc.
821 *
822 * The function is primarily of use for corruptions that
823 * happen outside the current execution context (e.g. when
824 * detected by a background scrubber)
825 *
826 * Must run in process context (e.g. a work queue) with interrupts
827 * enabled and no spinlocks hold.
828 */
829void memory_failure(unsigned long pfn, int trapno)
830{
831 __memory_failure(pfn, trapno, 0);
832}
diff --git a/mm/memory.c b/mm/memory.c
index b1443ac07c00..987389a809e7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1325,7 +1325,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1325 if (ret & VM_FAULT_ERROR) { 1325 if (ret & VM_FAULT_ERROR) {
1326 if (ret & VM_FAULT_OOM) 1326 if (ret & VM_FAULT_OOM)
1327 return i ? i : -ENOMEM; 1327 return i ? i : -ENOMEM;
1328 else if (ret & VM_FAULT_SIGBUS) 1328 if (ret &
1329 (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
1329 return i ? i : -EFAULT; 1330 return i ? i : -EFAULT;
1330 BUG(); 1331 BUG();
1331 } 1332 }
@@ -2559,8 +2560,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2559 goto out; 2560 goto out;
2560 2561
2561 entry = pte_to_swp_entry(orig_pte); 2562 entry = pte_to_swp_entry(orig_pte);
2562 if (is_migration_entry(entry)) { 2563 if (unlikely(non_swap_entry(entry))) {
2563 migration_entry_wait(mm, pmd, address); 2564 if (is_migration_entry(entry)) {
2565 migration_entry_wait(mm, pmd, address);
2566 } else if (is_hwpoison_entry(entry)) {
2567 ret = VM_FAULT_HWPOISON;
2568 } else {
2569 print_bad_pte(vma, address, orig_pte, NULL);
2570 ret = VM_FAULT_OOM;
2571 }
2564 goto out; 2572 goto out;
2565 } 2573 }
2566 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2574 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
@@ -2584,6 +2592,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2584 /* Had to read the page from swap area: Major fault */ 2592 /* Had to read the page from swap area: Major fault */
2585 ret = VM_FAULT_MAJOR; 2593 ret = VM_FAULT_MAJOR;
2586 count_vm_event(PGMAJFAULT); 2594 count_vm_event(PGMAJFAULT);
2595 } else if (PageHWPoison(page)) {
2596 ret = VM_FAULT_HWPOISON;
2597 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2598 goto out;
2587 } 2599 }
2588 2600
2589 lock_page(page); 2601 lock_page(page);
@@ -2760,6 +2772,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2760 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2772 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2761 return ret; 2773 return ret;
2762 2774
2775 if (unlikely(PageHWPoison(vmf.page))) {
2776 if (ret & VM_FAULT_LOCKED)
2777 unlock_page(vmf.page);
2778 return VM_FAULT_HWPOISON;
2779 }
2780
2763 /* 2781 /*
2764 * For consistency in subsequent calls, make the faulted page always 2782 * For consistency in subsequent calls, make the faulted page always
2765 * locked. 2783 * locked.
diff --git a/mm/migrate.c b/mm/migrate.c
index 16052e80aaac..1a4bf4813780 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -675,7 +675,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
675 } 675 }
676 676
677 /* Establish migration ptes or remove ptes */ 677 /* Establish migration ptes or remove ptes */
678 try_to_unmap(page, 1); 678 try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
679 679
680skip_unmap: 680skip_unmap:
681 if (!page_mapped(page)) 681 if (!page_mapped(page))
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index be197f71b096..d99664e8607e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1149,6 +1149,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
1149EXPORT_SYMBOL(redirty_page_for_writepage); 1149EXPORT_SYMBOL(redirty_page_for_writepage);
1150 1150
1151/* 1151/*
1152 * Dirty a page.
1153 *
1154 * For pages with a mapping this should be done under the page lock
1155 * for the benefit of asynchronous memory errors who prefer a consistent
1156 * dirty state. This rule can be broken in some special cases,
1157 * but should be better not to.
1158 *
1152 * If the mapping doesn't provide a set_page_dirty a_op, then 1159 * If the mapping doesn't provide a set_page_dirty a_op, then
1153 * just fall through and assume that it wants buffer_heads. 1160 * just fall through and assume that it wants buffer_heads.
1154 */ 1161 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 88248b3c20bb..bf720550b44d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -234,6 +234,12 @@ static void bad_page(struct page *page)
234 static unsigned long nr_shown; 234 static unsigned long nr_shown;
235 static unsigned long nr_unshown; 235 static unsigned long nr_unshown;
236 236
237 /* Don't complain about poisoned pages */
238 if (PageHWPoison(page)) {
239 __ClearPageBuddy(page);
240 return;
241 }
242
237 /* 243 /*
238 * Allow a burst of 60 reports, then keep quiet for that minute; 244 * Allow a burst of 60 reports, then keep quiet for that minute;
239 * or allow a steady drip of one report per second. 245 * or allow a steady drip of one report per second.
@@ -666,7 +672,7 @@ static inline void expand(struct zone *zone, struct page *page,
666/* 672/*
667 * This page is about to be returned from the page allocator 673 * This page is about to be returned from the page allocator
668 */ 674 */
669static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 675static inline int check_new_page(struct page *page)
670{ 676{
671 if (unlikely(page_mapcount(page) | 677 if (unlikely(page_mapcount(page) |
672 (page->mapping != NULL) | 678 (page->mapping != NULL) |
@@ -675,6 +681,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
675 bad_page(page); 681 bad_page(page);
676 return 1; 682 return 1;
677 } 683 }
684 return 0;
685}
686
687static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
688{
689 int i;
690
691 for (i = 0; i < (1 << order); i++) {
692 struct page *p = page + i;
693 if (unlikely(check_new_page(p)))
694 return 1;
695 }
678 696
679 set_page_private(page, 0); 697 set_page_private(page, 0);
680 set_page_refcounted(page); 698 set_page_refcounted(page);
diff --git a/mm/rmap.c b/mm/rmap.c
index 720fc03a7bc4..28aafe2b5306 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,6 +36,11 @@
36 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_lock in __sync_single_inode) 38 * within inode_lock in __sync_single_inode)
39 *
40 * (code doesn't rely on that order so it could be switched around)
41 * ->tasklist_lock
42 * anon_vma->lock (memory_failure, collect_procs_anon)
43 * pte map lock
39 */ 44 */
40 45
41#include <linux/mm.h> 46#include <linux/mm.h>
@@ -191,7 +196,7 @@ void __init anon_vma_init(void)
191 * Getting a lock on a stable anon_vma from a page off the LRU is 196 * Getting a lock on a stable anon_vma from a page off the LRU is
192 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 197 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
193 */ 198 */
194static struct anon_vma *page_lock_anon_vma(struct page *page) 199struct anon_vma *page_lock_anon_vma(struct page *page)
195{ 200{
196 struct anon_vma *anon_vma; 201 struct anon_vma *anon_vma;
197 unsigned long anon_mapping; 202 unsigned long anon_mapping;
@@ -211,7 +216,7 @@ out:
211 return NULL; 216 return NULL;
212} 217}
213 218
214static void page_unlock_anon_vma(struct anon_vma *anon_vma) 219void page_unlock_anon_vma(struct anon_vma *anon_vma)
215{ 220{
216 spin_unlock(&anon_vma->lock); 221 spin_unlock(&anon_vma->lock);
217 rcu_read_unlock(); 222 rcu_read_unlock();
@@ -311,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
311 * if the page is not mapped into the page tables of this VMA. Only 316 * if the page is not mapped into the page tables of this VMA. Only
312 * valid for normal file or anonymous VMAs. 317 * valid for normal file or anonymous VMAs.
313 */ 318 */
314static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) 319int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
315{ 320{
316 unsigned long address; 321 unsigned long address;
317 pte_t *pte; 322 pte_t *pte;
@@ -756,7 +761,7 @@ void page_remove_rmap(struct page *page)
756 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 761 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
757 */ 762 */
758static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 763static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
759 int migration) 764 enum ttu_flags flags)
760{ 765{
761 struct mm_struct *mm = vma->vm_mm; 766 struct mm_struct *mm = vma->vm_mm;
762 unsigned long address; 767 unsigned long address;
@@ -778,11 +783,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
778 * If it's recently referenced (perhaps page_referenced 783 * If it's recently referenced (perhaps page_referenced
779 * skipped over this mm) then we should reactivate it. 784 * skipped over this mm) then we should reactivate it.
780 */ 785 */
781 if (!migration) { 786 if (!(flags & TTU_IGNORE_MLOCK)) {
782 if (vma->vm_flags & VM_LOCKED) { 787 if (vma->vm_flags & VM_LOCKED) {
783 ret = SWAP_MLOCK; 788 ret = SWAP_MLOCK;
784 goto out_unmap; 789 goto out_unmap;
785 } 790 }
791 }
792 if (!(flags & TTU_IGNORE_ACCESS)) {
786 if (ptep_clear_flush_young_notify(vma, address, pte)) { 793 if (ptep_clear_flush_young_notify(vma, address, pte)) {
787 ret = SWAP_FAIL; 794 ret = SWAP_FAIL;
788 goto out_unmap; 795 goto out_unmap;
@@ -800,7 +807,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
800 /* Update high watermark before we lower rss */ 807 /* Update high watermark before we lower rss */
801 update_hiwater_rss(mm); 808 update_hiwater_rss(mm);
802 809
803 if (PageAnon(page)) { 810 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
811 if (PageAnon(page))
812 dec_mm_counter(mm, anon_rss);
813 else
814 dec_mm_counter(mm, file_rss);
815 set_pte_at(mm, address, pte,
816 swp_entry_to_pte(make_hwpoison_entry(page)));
817 } else if (PageAnon(page)) {
804 swp_entry_t entry = { .val = page_private(page) }; 818 swp_entry_t entry = { .val = page_private(page) };
805 819
806 if (PageSwapCache(page)) { 820 if (PageSwapCache(page)) {
@@ -822,12 +836,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
822 * pte. do_swap_page() will wait until the migration 836 * pte. do_swap_page() will wait until the migration
823 * pte is removed and then restart fault handling. 837 * pte is removed and then restart fault handling.
824 */ 838 */
825 BUG_ON(!migration); 839 BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
826 entry = make_migration_entry(page, pte_write(pteval)); 840 entry = make_migration_entry(page, pte_write(pteval));
827 } 841 }
828 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 842 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
829 BUG_ON(pte_file(*pte)); 843 BUG_ON(pte_file(*pte));
830 } else if (PAGE_MIGRATION && migration) { 844 } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
831 /* Establish migration entry for a file page */ 845 /* Establish migration entry for a file page */
832 swp_entry_t entry; 846 swp_entry_t entry;
833 entry = make_migration_entry(page, pte_write(pteval)); 847 entry = make_migration_entry(page, pte_write(pteval));
@@ -996,12 +1010,13 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
996 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1010 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
997 * 'LOCKED. 1011 * 'LOCKED.
998 */ 1012 */
999static int try_to_unmap_anon(struct page *page, int unlock, int migration) 1013static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1000{ 1014{
1001 struct anon_vma *anon_vma; 1015 struct anon_vma *anon_vma;
1002 struct vm_area_struct *vma; 1016 struct vm_area_struct *vma;
1003 unsigned int mlocked = 0; 1017 unsigned int mlocked = 0;
1004 int ret = SWAP_AGAIN; 1018 int ret = SWAP_AGAIN;
1019 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1005 1020
1006 if (MLOCK_PAGES && unlikely(unlock)) 1021 if (MLOCK_PAGES && unlikely(unlock))
1007 ret = SWAP_SUCCESS; /* default for try_to_munlock() */ 1022 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
@@ -1017,7 +1032,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1017 continue; /* must visit all unlocked vmas */ 1032 continue; /* must visit all unlocked vmas */
1018 ret = SWAP_MLOCK; /* saw at least one mlocked vma */ 1033 ret = SWAP_MLOCK; /* saw at least one mlocked vma */
1019 } else { 1034 } else {
1020 ret = try_to_unmap_one(page, vma, migration); 1035 ret = try_to_unmap_one(page, vma, flags);
1021 if (ret == SWAP_FAIL || !page_mapped(page)) 1036 if (ret == SWAP_FAIL || !page_mapped(page))
1022 break; 1037 break;
1023 } 1038 }
@@ -1041,8 +1056,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1041/** 1056/**
1042 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method 1057 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
1043 * @page: the page to unmap/unlock 1058 * @page: the page to unmap/unlock
1044 * @unlock: request for unlock rather than unmap [unlikely] 1059 * @flags: action and flags
1045 * @migration: unmapping for migration - ignored if @unlock
1046 * 1060 *
1047 * Find all the mappings of a page using the mapping pointer and the vma chains 1061 * Find all the mappings of a page using the mapping pointer and the vma chains
1048 * contained in the address_space struct it points to. 1062 * contained in the address_space struct it points to.
@@ -1054,7 +1068,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1054 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1068 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1055 * 'LOCKED. 1069 * 'LOCKED.
1056 */ 1070 */
1057static int try_to_unmap_file(struct page *page, int unlock, int migration) 1071static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1058{ 1072{
1059 struct address_space *mapping = page->mapping; 1073 struct address_space *mapping = page->mapping;
1060 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1074 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -1066,6 +1080,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1066 unsigned long max_nl_size = 0; 1080 unsigned long max_nl_size = 0;
1067 unsigned int mapcount; 1081 unsigned int mapcount;
1068 unsigned int mlocked = 0; 1082 unsigned int mlocked = 0;
1083 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1069 1084
1070 if (MLOCK_PAGES && unlikely(unlock)) 1085 if (MLOCK_PAGES && unlikely(unlock))
1071 ret = SWAP_SUCCESS; /* default for try_to_munlock() */ 1086 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
@@ -1078,7 +1093,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1078 continue; /* must visit all vmas */ 1093 continue; /* must visit all vmas */
1079 ret = SWAP_MLOCK; 1094 ret = SWAP_MLOCK;
1080 } else { 1095 } else {
1081 ret = try_to_unmap_one(page, vma, migration); 1096 ret = try_to_unmap_one(page, vma, flags);
1082 if (ret == SWAP_FAIL || !page_mapped(page)) 1097 if (ret == SWAP_FAIL || !page_mapped(page))
1083 goto out; 1098 goto out;
1084 } 1099 }
@@ -1103,7 +1118,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1103 ret = SWAP_MLOCK; /* leave mlocked == 0 */ 1118 ret = SWAP_MLOCK; /* leave mlocked == 0 */
1104 goto out; /* no need to look further */ 1119 goto out; /* no need to look further */
1105 } 1120 }
1106 if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) 1121 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1122 (vma->vm_flags & VM_LOCKED))
1107 continue; 1123 continue;
1108 cursor = (unsigned long) vma->vm_private_data; 1124 cursor = (unsigned long) vma->vm_private_data;
1109 if (cursor > max_nl_cursor) 1125 if (cursor > max_nl_cursor)
@@ -1137,7 +1153,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1137 do { 1153 do {
1138 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1154 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1139 shared.vm_set.list) { 1155 shared.vm_set.list) {
1140 if (!MLOCK_PAGES && !migration && 1156 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1141 (vma->vm_flags & VM_LOCKED)) 1157 (vma->vm_flags & VM_LOCKED))
1142 continue; 1158 continue;
1143 cursor = (unsigned long) vma->vm_private_data; 1159 cursor = (unsigned long) vma->vm_private_data;
@@ -1177,7 +1193,7 @@ out:
1177/** 1193/**
1178 * try_to_unmap - try to remove all page table mappings to a page 1194 * try_to_unmap - try to remove all page table mappings to a page
1179 * @page: the page to get unmapped 1195 * @page: the page to get unmapped
1180 * @migration: migration flag 1196 * @flags: action and flags
1181 * 1197 *
1182 * Tries to remove all the page table entries which are mapping this 1198 * Tries to remove all the page table entries which are mapping this
1183 * page, used in the pageout path. Caller must hold the page lock. 1199 * page, used in the pageout path. Caller must hold the page lock.
@@ -1188,16 +1204,16 @@ out:
1188 * SWAP_FAIL - the page is unswappable 1204 * SWAP_FAIL - the page is unswappable
1189 * SWAP_MLOCK - page is mlocked. 1205 * SWAP_MLOCK - page is mlocked.
1190 */ 1206 */
1191int try_to_unmap(struct page *page, int migration) 1207int try_to_unmap(struct page *page, enum ttu_flags flags)
1192{ 1208{
1193 int ret; 1209 int ret;
1194 1210
1195 BUG_ON(!PageLocked(page)); 1211 BUG_ON(!PageLocked(page));
1196 1212
1197 if (PageAnon(page)) 1213 if (PageAnon(page))
1198 ret = try_to_unmap_anon(page, 0, migration); 1214 ret = try_to_unmap_anon(page, flags);
1199 else 1215 else
1200 ret = try_to_unmap_file(page, 0, migration); 1216 ret = try_to_unmap_file(page, flags);
1201 if (ret != SWAP_MLOCK && !page_mapped(page)) 1217 if (ret != SWAP_MLOCK && !page_mapped(page))
1202 ret = SWAP_SUCCESS; 1218 ret = SWAP_SUCCESS;
1203 return ret; 1219 return ret;
@@ -1222,8 +1238,8 @@ int try_to_munlock(struct page *page)
1222 VM_BUG_ON(!PageLocked(page) || PageLRU(page)); 1238 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1223 1239
1224 if (PageAnon(page)) 1240 if (PageAnon(page))
1225 return try_to_unmap_anon(page, 1, 0); 1241 return try_to_unmap_anon(page, TTU_MUNLOCK);
1226 else 1242 else
1227 return try_to_unmap_file(page, 1, 0); 1243 return try_to_unmap_file(page, TTU_MUNLOCK);
1228} 1244}
1229 1245
diff --git a/mm/shmem.c b/mm/shmem.c
index b206a7a32e2a..98631c26c200 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1633,8 +1633,8 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1633 if (pos + copied > inode->i_size) 1633 if (pos + copied > inode->i_size)
1634 i_size_write(inode, pos + copied); 1634 i_size_write(inode, pos + copied);
1635 1635
1636 unlock_page(page);
1637 set_page_dirty(page); 1636 set_page_dirty(page);
1637 unlock_page(page);
1638 page_cache_release(page); 1638 page_cache_release(page);
1639 1639
1640 return copied; 1640 return copied;
@@ -1971,13 +1971,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1971 iput(inode); 1971 iput(inode);
1972 return error; 1972 return error;
1973 } 1973 }
1974 unlock_page(page);
1975 inode->i_mapping->a_ops = &shmem_aops; 1974 inode->i_mapping->a_ops = &shmem_aops;
1976 inode->i_op = &shmem_symlink_inode_operations; 1975 inode->i_op = &shmem_symlink_inode_operations;
1977 kaddr = kmap_atomic(page, KM_USER0); 1976 kaddr = kmap_atomic(page, KM_USER0);
1978 memcpy(kaddr, symname, len); 1977 memcpy(kaddr, symname, len);
1979 kunmap_atomic(kaddr, KM_USER0); 1978 kunmap_atomic(kaddr, KM_USER0);
1980 set_page_dirty(page); 1979 set_page_dirty(page);
1980 unlock_page(page);
1981 page_cache_release(page); 1981 page_cache_release(page);
1982 } 1982 }
1983 if (dir->i_mode & S_ISGID) 1983 if (dir->i_mode & S_ISGID)
@@ -2420,6 +2420,7 @@ static const struct address_space_operations shmem_aops = {
2420 .write_end = shmem_write_end, 2420 .write_end = shmem_write_end,
2421#endif 2421#endif
2422 .migratepage = migrate_page, 2422 .migratepage = migrate_page,
2423 .error_remove_page = generic_error_remove_page,
2423}; 2424};
2424 2425
2425static const struct file_operations shmem_file_operations = { 2426static const struct file_operations shmem_file_operations = {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f1bf19daadc6..4de7f02f820b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -699,7 +699,7 @@ int free_swap_and_cache(swp_entry_t entry)
699 struct swap_info_struct *p; 699 struct swap_info_struct *p;
700 struct page *page = NULL; 700 struct page *page = NULL;
701 701
702 if (is_migration_entry(entry)) 702 if (non_swap_entry(entry))
703 return 1; 703 return 1;
704 704
705 p = swap_info_get(entry); 705 p = swap_info_get(entry);
@@ -2085,7 +2085,7 @@ static int __swap_duplicate(swp_entry_t entry, bool cache)
2085 int count; 2085 int count;
2086 bool has_cache; 2086 bool has_cache;
2087 2087
2088 if (is_migration_entry(entry)) 2088 if (non_swap_entry(entry))
2089 return -EINVAL; 2089 return -EINVAL;
2090 2090
2091 type = swp_type(entry); 2091 type = swp_type(entry);
diff --git a/mm/truncate.c b/mm/truncate.c
index ccc3ecf7cb98..a17b3977cfdf 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -93,11 +93,11 @@ EXPORT_SYMBOL(cancel_dirty_page);
93 * its lock, b) when a concurrent invalidate_mapping_pages got there first and 93 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
94 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. 94 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
95 */ 95 */
96static void 96static int
97truncate_complete_page(struct address_space *mapping, struct page *page) 97truncate_complete_page(struct address_space *mapping, struct page *page)
98{ 98{
99 if (page->mapping != mapping) 99 if (page->mapping != mapping)
100 return; 100 return -EIO;
101 101
102 if (page_has_private(page)) 102 if (page_has_private(page))
103 do_invalidatepage(page, 0); 103 do_invalidatepage(page, 0);
@@ -108,6 +108,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
108 remove_from_page_cache(page); 108 remove_from_page_cache(page);
109 ClearPageMappedToDisk(page); 109 ClearPageMappedToDisk(page);
110 page_cache_release(page); /* pagecache ref */ 110 page_cache_release(page); /* pagecache ref */
111 return 0;
111} 112}
112 113
113/* 114/*
@@ -135,6 +136,51 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
135 return ret; 136 return ret;
136} 137}
137 138
139int truncate_inode_page(struct address_space *mapping, struct page *page)
140{
141 if (page_mapped(page)) {
142 unmap_mapping_range(mapping,
143 (loff_t)page->index << PAGE_CACHE_SHIFT,
144 PAGE_CACHE_SIZE, 0);
145 }
146 return truncate_complete_page(mapping, page);
147}
148
149/*
150 * Used to get rid of pages on hardware memory corruption.
151 */
152int generic_error_remove_page(struct address_space *mapping, struct page *page)
153{
154 if (!mapping)
155 return -EINVAL;
156 /*
157 * Only punch for normal data pages for now.
158 * Handling other types like directories would need more auditing.
159 */
160 if (!S_ISREG(mapping->host->i_mode))
161 return -EIO;
162 return truncate_inode_page(mapping, page);
163}
164EXPORT_SYMBOL(generic_error_remove_page);
165
166/*
167 * Safely invalidate one page from its pagecache mapping.
168 * It only drops clean, unused pages. The page must be locked.
169 *
170 * Returns 1 if the page is successfully invalidated, otherwise 0.
171 */
172int invalidate_inode_page(struct page *page)
173{
174 struct address_space *mapping = page_mapping(page);
175 if (!mapping)
176 return 0;
177 if (PageDirty(page) || PageWriteback(page))
178 return 0;
179 if (page_mapped(page))
180 return 0;
181 return invalidate_complete_page(mapping, page);
182}
183
138/** 184/**
139 * truncate_inode_pages - truncate range of pages specified by start & end byte offsets 185 * truncate_inode_pages - truncate range of pages specified by start & end byte offsets
140 * @mapping: mapping to truncate 186 * @mapping: mapping to truncate
@@ -196,12 +242,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
196 unlock_page(page); 242 unlock_page(page);
197 continue; 243 continue;
198 } 244 }
199 if (page_mapped(page)) { 245 truncate_inode_page(mapping, page);
200 unmap_mapping_range(mapping,
201 (loff_t)page_index<<PAGE_CACHE_SHIFT,
202 PAGE_CACHE_SIZE, 0);
203 }
204 truncate_complete_page(mapping, page);
205 unlock_page(page); 246 unlock_page(page);
206 } 247 }
207 pagevec_release(&pvec); 248 pagevec_release(&pvec);
@@ -238,15 +279,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
238 break; 279 break;
239 lock_page(page); 280 lock_page(page);
240 wait_on_page_writeback(page); 281 wait_on_page_writeback(page);
241 if (page_mapped(page)) { 282 truncate_inode_page(mapping, page);
242 unmap_mapping_range(mapping,
243 (loff_t)page->index<<PAGE_CACHE_SHIFT,
244 PAGE_CACHE_SIZE, 0);
245 }
246 if (page->index > next) 283 if (page->index > next)
247 next = page->index; 284 next = page->index;
248 next++; 285 next++;
249 truncate_complete_page(mapping, page);
250 unlock_page(page); 286 unlock_page(page);
251 } 287 }
252 pagevec_release(&pvec); 288 pagevec_release(&pvec);
@@ -311,12 +347,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
311 if (lock_failed) 347 if (lock_failed)
312 continue; 348 continue;
313 349
314 if (PageDirty(page) || PageWriteback(page)) 350 ret += invalidate_inode_page(page);
315 goto unlock; 351
316 if (page_mapped(page))
317 goto unlock;
318 ret += invalidate_complete_page(mapping, page);
319unlock:
320 unlock_page(page); 352 unlock_page(page);
321 if (next > end) 353 if (next > end)
322 break; 354 break;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f444b7409085..1219ceb8a9b2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -663,7 +663,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
663 * processes. Try to unmap it here. 663 * processes. Try to unmap it here.
664 */ 664 */
665 if (page_mapped(page) && mapping) { 665 if (page_mapped(page) && mapping) {
666 switch (try_to_unmap(page, 0)) { 666 switch (try_to_unmap(page, TTU_UNMAP)) {
667 case SWAP_FAIL: 667 case SWAP_FAIL:
668 goto activate_locked; 668 goto activate_locked;
669 case SWAP_AGAIN: 669 case SWAP_AGAIN: