 Documentation/ABI/testing/sysfs-memory-page-offline |  44
 Documentation/vm/hwpoison.txt                       |  52
 Documentation/vm/page-types.c                       |  15
 MAINTAINERS                                         |   9
 drivers/base/memory.c                               |  61
 fs/proc/page.c                                      |  45
 include/asm-generic/mman-common.h                   |   1
 include/linux/kernel-page-flags.h                   |  46
 include/linux/memcontrol.h                          |  13
 include/linux/mm.h                                  |   8
 include/linux/page-flags.h                          |   4
 mm/Kconfig                                          |   3
 mm/hwpoison-inject.c                                | 113
 mm/madvise.c                                        |  21
 mm/memcontrol.c                                     |  16
 mm/memory-failure.c                                 | 560
 mm/memory.c                                         |   4
 mm/page_alloc.c                                     |  21
 19 files changed, 922 insertions(+), 126 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-memory-page-offline b/Documentation/ABI/testing/sysfs-memory-page-offline
new file mode 100644
index 000000000000..e14703f12fdf
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-memory-page-offline
@@ -0,0 +1,44 @@
1What: /sys/devices/system/memory/soft_offline_page
2Date: Sep 2009
3KernelVersion: 2.6.33
4Contact: andi@firstfloor.org
5Description:
6 Soft-offline the memory page containing the physical address
7 written into this file. Input is a hex number specifying the
8 physical address of the page. The kernel will then attempt
9 to soft-offline it, by moving the contents elsewhere or
10 dropping it if possible. The page will then be placed
11 on the bad page list and never be reused.
12
13 The offlining is done at a kernel-specific granularity.
14 Normally it's the base page size of the kernel, but
15 this might change.
16
17 The page must still be accessible, i.e. not poisoned. The
18 kernel will never kill anything for this, but rather
19 fail the offline. The return value is the size of the
20 input number on success, or an error when the offlining
21 failed. Reading the file is not allowed.
22
23What: /sys/devices/system/memory/hard_offline_page
24Date: Sep 2009
25KernelVersion: 2.6.33
26Contact: andi@firstfloor.org
27Description:
28 Hard-offline the memory page containing the physical
29 address written into this file. Input is a hex number
30 specifying the physical address of the page. The
31 kernel will then attempt to hard-offline the page, by
32 trying to drop the page or killing any owner or
33 triggering IO errors if needed. Note this may kill
34 any processes owning the page. The kernel will avoid
35 accessing this page, assuming it has been poisoned by
36 the hardware.
37
38 The offlining is done at a kernel-specific granularity.
39 Normally it's the base page size of the kernel, but
40 this might change.
41
42 The return value is the size of the input number, or an
43 error when the offlining failed.
44 Reading the file is not allowed.
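
As a usage sketch (assuming root and CONFIG_MEMORY_FAILURE; the physical
address below is purely illustrative), the file can be driven from a small
C program; hard_offline_page is written the same way:

	/* Sketch: soft-offline the page behind an illustrative physical address. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *addr = "0x1234000";	/* illustrative address */
		int fd = open("/sys/devices/system/memory/soft_offline_page",
			      O_WRONLY);

		if (fd < 0) {	/* needs root and CONFIG_MEMORY_FAILURE */
			perror("open");
			return 1;
		}
		if (write(fd, addr, strlen(addr)) < 0)
			perror("soft offline failed");
		close(fd);
		return 0;
	}
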
diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt
index 3ffadf8da61f..12f9ba20ccb7 100644
--- a/Documentation/vm/hwpoison.txt
+++ b/Documentation/vm/hwpoison.txt
@@ -92,16 +92,62 @@ PR_MCE_KILL_GET
92 92
93Testing: 93Testing:
94 94
95madvise(MADV_POISON, ....) 95madvise(MADV_HWPOISON, ....)
96 (as root) 96 (as root)
97 Poison a page in the process for testing 97 Poison a page in the process for testing
98 98
99 99
100hwpoison-inject module through debugfs 100hwpoison-inject module through debugfs
101 /sys/debug/hwpoison/corrupt-pfn
102 101
103Inject hwpoison fault at PFN echoed into this file 102/sys/debug/hwpoison/
104 103
104corrupt-pfn
105
106Inject hwpoison fault at PFN echoed into this file. This does
107some early filtering to avoid corrupting unintended pages in test suites.
108
109unpoison-pfn
110
111Software-unpoison page at PFN echoed into this file. This
112way a page can be reused again.
113This only works for Linux-injected failures, not for real
114memory failures.
115
116Note these injection interfaces are not stable and might change between
117kernel versions.
118
119corrupt-filter-dev-major
120corrupt-filter-dev-minor
121
122Only handle memory failures for pages associated with the file system defined
123by block device major/minor. -1U is the wildcard value.
124This should only be used for testing with artificial injection.
125
126corrupt-filter-memcg
127
128Limit injection to pages owned by the given memcg, specified by the inode
129number of the memcg.
130
131Example:
132 mkdir /cgroup/hwpoison
133
134 usemem -m 100 -s 1000 &
135 echo `jobs -p` > /cgroup/hwpoison/tasks
136
137 memcg_ino=$(ls -id /cgroup/hwpoison | cut -f1 -d' ')
138 echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
139
140 page-types -p `pidof init` --hwpoison # shall do nothing
141 page-types -p `pidof usemem` --hwpoison # poison its pages
142
143corrupt-filter-flags-mask
144corrupt-filter-flags-value
145
146When specified, only poison pages if ((page_flags & mask) == value).
147This allows stress testing of many kinds of pages. The page_flags
148are the same as in /proc/kpageflags. The flag bits are defined in
149include/linux/kernel-page-flags.h and documented in
150Documentation/vm/pagemap.txt
105 151
106Architecture specific MCE injector 152Architecture specific MCE injector
107 153
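
To make the flags filter concrete, a small sketch (assuming debugfs is
mounted at /debug, as in the memcg example above) that limits injection to
dirty LRU pages; the bit numbers come from include/linux/kernel-page-flags.h:

	/* Sketch: limit hwpoison injection to dirty LRU pages. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	#define KPF_DIRTY 4	/* from include/linux/kernel-page-flags.h */
	#define KPF_LRU   5

	static int write_val(const char *path, unsigned long long val)
	{
		char buf[32];
		int fd = open(path, O_WRONLY);

		if (fd < 0)
			return -1;
		snprintf(buf, sizeof(buf), "%llu", val);
		if (write(fd, buf, strlen(buf)) < 0) {
			close(fd);
			return -1;
		}
		close(fd);
		return 0;
	}

	int main(void)
	{
		unsigned long long mask = (1ULL << KPF_DIRTY) | (1ULL << KPF_LRU);

		/* Assumes debugfs mounted at /debug, as in the example above. */
		write_val("/debug/hwpoison/corrupt-filter-flags-mask", mask);
		write_val("/debug/hwpoison/corrupt-filter-flags-value", mask);
		write_val("/debug/hwpoison/corrupt-filter-enable", 1);
		return 0;
	}
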
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c
index 7a7d9bab32ef..66e9358e2144 100644
--- a/Documentation/vm/page-types.c
+++ b/Documentation/vm/page-types.c
@@ -1,11 +1,22 @@
1/* 1/*
2 * page-types: Tool for querying page flags 2 * page-types: Tool for querying page flags
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License as published by the Free
6 * Software Foundation; version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should find a copy of v2 of the GNU General Public License somewhere on
14 * your Linux system; if not, write to the Free Software Foundation, Inc., 59
15 * Temple Place, Suite 330, Boston, MA 02111-1307 USA.
16 *
4 * Copyright (C) 2009 Intel corporation 17 * Copyright (C) 2009 Intel corporation
5 * 18 *
6 * Authors: Wu Fengguang <fengguang.wu@intel.com> 19 * Authors: Wu Fengguang <fengguang.wu@intel.com>
7 *
8 * Released under the General Public License (GPL).
9 */ 20 */
10 21
11#define _LARGEFILE64_SOURCE 22#define _LARGEFILE64_SOURCE
diff --git a/MAINTAINERS b/MAINTAINERS
index d6a27110a747..0699782f8c5b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2377,6 +2377,15 @@ W: http://www.kernel.org/pub/linux/kernel/people/fseidel/hdaps/
2377S: Maintained 2377S: Maintained
2378F: drivers/hwmon/hdaps.c 2378F: drivers/hwmon/hdaps.c
2379 2379
2380HWPOISON MEMORY FAILURE HANDLING
2381M: Andi Kleen <andi@firstfloor.org>
2382L: linux-mm@kvack.org
2383L: linux-kernel@vger.kernel.org
2384T: git git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6.git hwpoison
2385S: Maintained
2386F: mm/memory-failure.c
2387F: mm/hwpoison-inject.c
2388
2380HYPERVISOR VIRTUAL CONSOLE DRIVER 2389HYPERVISOR VIRTUAL CONSOLE DRIVER
2381L: linuxppc-dev@ozlabs.org 2390L: linuxppc-dev@ozlabs.org
2382S: Odd Fixes 2391S: Odd Fixes
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 989429cfed88..c4c8f2e1dd15 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -341,6 +341,64 @@ static inline int memory_probe_init(void)
341} 341}
342#endif 342#endif
343 343
344#ifdef CONFIG_MEMORY_FAILURE
345/*
346 * Support for offlining pages of memory
347 */
348
349/* Soft offline a page */
350static ssize_t
351store_soft_offline_page(struct class *class, const char *buf, size_t count)
352{
353 int ret;
354 u64 pfn;
355 if (!capable(CAP_SYS_ADMIN))
356 return -EPERM;
357 if (strict_strtoull(buf, 0, &pfn) < 0)
358 return -EINVAL;
359 pfn >>= PAGE_SHIFT;
360 if (!pfn_valid(pfn))
361 return -ENXIO;
362 ret = soft_offline_page(pfn_to_page(pfn), 0);
363 return ret == 0 ? count : ret;
364}
365
366/* Forcibly offline a page, including killing processes. */
367static ssize_t
368store_hard_offline_page(struct class *class, const char *buf, size_t count)
369{
370 int ret;
371 u64 pfn;
372 if (!capable(CAP_SYS_ADMIN))
373 return -EPERM;
374 if (strict_strtoull(buf, 0, &pfn) < 0)
375 return -EINVAL;
376 pfn >>= PAGE_SHIFT;
377 ret = __memory_failure(pfn, 0, 0);
378 return ret ? ret : count;
379}
380
381static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page);
382static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page);
383
384static __init int memory_fail_init(void)
385{
386 int err;
387
388 err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
389 &class_attr_soft_offline_page.attr);
390 if (!err)
391 err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
392 &class_attr_hard_offline_page.attr);
393 return err;
394}
395#else
396static inline int memory_fail_init(void)
397{
398 return 0;
399}
400#endif
401
344/* 402/*
345 * Note that phys_device is optional. It is here to allow for 403 * Note that phys_device is optional. It is here to allow for
346 * differentiation between which *physical* devices each 404 * differentiation between which *physical* devices each
@@ -473,6 +531,9 @@ int __init memory_dev_init(void)
473 err = memory_probe_init(); 531 err = memory_probe_init();
474 if (!ret) 532 if (!ret)
475 ret = err; 533 ret = err;
534 err = memory_fail_init();
535 if (!ret)
536 ret = err;
476 err = block_size_init(); 537 err = block_size_init();
477 if (!ret) 538 if (!ret)
478 ret = err; 539 ret = err;
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 5033ce0d254b..180cf5a0bd67 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -8,6 +8,7 @@
8#include <linux/proc_fs.h> 8#include <linux/proc_fs.h>
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/hugetlb.h> 10#include <linux/hugetlb.h>
11#include <linux/kernel-page-flags.h>
11#include <asm/uaccess.h> 12#include <asm/uaccess.h>
12#include "internal.h" 13#include "internal.h"
13 14
@@ -71,52 +72,12 @@ static const struct file_operations proc_kpagecount_operations = {
71 * physical page flags. 72 * physical page flags.
72 */ 73 */
73 74
74/* These macros are used to decouple internal flags from exported ones */
75
76#define KPF_LOCKED 0
77#define KPF_ERROR 1
78#define KPF_REFERENCED 2
79#define KPF_UPTODATE 3
80#define KPF_DIRTY 4
81#define KPF_LRU 5
82#define KPF_ACTIVE 6
83#define KPF_SLAB 7
84#define KPF_WRITEBACK 8
85#define KPF_RECLAIM 9
86#define KPF_BUDDY 10
87
88/* 11-20: new additions in 2.6.31 */
89#define KPF_MMAP 11
90#define KPF_ANON 12
91#define KPF_SWAPCACHE 13
92#define KPF_SWAPBACKED 14
93#define KPF_COMPOUND_HEAD 15
94#define KPF_COMPOUND_TAIL 16
95#define KPF_HUGE 17
96#define KPF_UNEVICTABLE 18
97#define KPF_HWPOISON 19
98#define KPF_NOPAGE 20
99
100#define KPF_KSM 21
101
102/* kernel hacking assistances
103 * WARNING: subject to change, never rely on them!
104 */
105#define KPF_RESERVED 32
106#define KPF_MLOCKED 33
107#define KPF_MAPPEDTODISK 34
108#define KPF_PRIVATE 35
109#define KPF_PRIVATE_2 36
110#define KPF_OWNER_PRIVATE 37
111#define KPF_ARCH 38
112#define KPF_UNCACHED 39
113
114static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) 75static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
115{ 76{
116 return ((kflags >> kbit) & 1) << ubit; 77 return ((kflags >> kbit) & 1) << ubit;
117} 78}
118 79
119static u64 get_uflags(struct page *page) 80u64 stable_page_flags(struct page *page)
120{ 81{
121 u64 k; 82 u64 k;
122 u64 u; 83 u64 u;
@@ -219,7 +180,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
219 else 180 else
220 ppage = NULL; 181 ppage = NULL;
221 182
222 if (put_user(get_uflags(ppage), out)) { 183 if (put_user(stable_page_flags(ppage), out)) {
223 ret = -EFAULT; 184 ret = -EFAULT;
224 break; 185 break;
225 } 186 }
diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h
index 20111265afd8..3da9e2742fa0 100644
--- a/include/asm-generic/mman-common.h
+++ b/include/asm-generic/mman-common.h
@@ -40,6 +40,7 @@
40#define MADV_DONTFORK 10 /* don't inherit across fork */ 40#define MADV_DONTFORK 10 /* don't inherit across fork */
41#define MADV_DOFORK 11 /* do inherit across fork */ 41#define MADV_DOFORK 11 /* do inherit across fork */
42#define MADV_HWPOISON 100 /* poison a page for testing */ 42#define MADV_HWPOISON 100 /* poison a page for testing */
43#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */
43 44
44#define MADV_MERGEABLE 12 /* KSM may merge identical pages */ 45#define MADV_MERGEABLE 12 /* KSM may merge identical pages */
45#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ 46#define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h
new file mode 100644
index 000000000000..bd92a89f4b0a
--- /dev/null
+++ b/include/linux/kernel-page-flags.h
@@ -0,0 +1,46 @@
1#ifndef LINUX_KERNEL_PAGE_FLAGS_H
2#define LINUX_KERNEL_PAGE_FLAGS_H
3
4/*
5 * Stable page flag bits exported to user space
6 */
7
8#define KPF_LOCKED 0
9#define KPF_ERROR 1
10#define KPF_REFERENCED 2
11#define KPF_UPTODATE 3
12#define KPF_DIRTY 4
13#define KPF_LRU 5
14#define KPF_ACTIVE 6
15#define KPF_SLAB 7
16#define KPF_WRITEBACK 8
17#define KPF_RECLAIM 9
18#define KPF_BUDDY 10
19
20/* 11-20: new additions in 2.6.31 */
21#define KPF_MMAP 11
22#define KPF_ANON 12
23#define KPF_SWAPCACHE 13
24#define KPF_SWAPBACKED 14
25#define KPF_COMPOUND_HEAD 15
26#define KPF_COMPOUND_TAIL 16
27#define KPF_HUGE 17
28#define KPF_UNEVICTABLE 18
29#define KPF_HWPOISON 19
30#define KPF_NOPAGE 20
31
32#define KPF_KSM 21
33
34/* kernel hacking assistances
35 * WARNING: subject to change, never rely on them!
36 */
37#define KPF_RESERVED 32
38#define KPF_MLOCKED 33
39#define KPF_MAPPEDTODISK 34
40#define KPF_PRIVATE 35
41#define KPF_PRIVATE_2 36
42#define KPF_OWNER_PRIVATE 37
43#define KPF_ARCH 38
44#define KPF_UNCACHED 39
45
46#endif /* LINUX_KERNEL_PAGE_FLAGS_H */
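
A sketch of how user space consumes these bits (the PFN is illustrative):
/proc/kpageflags exports one u64 of these stable flags per page, indexed
by PFN, so a reader seeks to pfn * 8 and tests individual KPF_* bits:

	/* Sketch: read the stable flags word for one PFN from /proc/kpageflags. */
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	#define KPF_LRU      5
	#define KPF_HWPOISON 19

	int main(void)
	{
		uint64_t flags;
		unsigned long pfn = 0x1000;	/* illustrative PFN */
		int fd = open("/proc/kpageflags", O_RDONLY);

		if (fd < 0)
			return 1;
		/* One u64 of KPF_* bits per page, indexed by PFN. */
		if (pread(fd, &flags, sizeof(flags), pfn * sizeof(flags))
		    == (ssize_t)sizeof(flags))
			printf("pfn %#lx: lru=%d hwpoison=%d\n", pfn,
			       (int)((flags >> KPF_LRU) & 1),
			       (int)((flags >> KPF_HWPOISON) & 1));
		close(fd);
		return 0;
	}
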
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0b46c2068b96..1f9b119f4ace 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -73,6 +73,7 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
73extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); 73extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
74int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); 74int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
75 75
76extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
76extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); 77extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
77 78
78static inline 79static inline
@@ -85,6 +86,8 @@ int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
85 return cgroup == mem; 86 return cgroup == mem;
86} 87}
87 88
89extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem);
90
88extern int 91extern int
89mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr); 92mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr);
90extern void mem_cgroup_end_migration(struct mem_cgroup *mem, 93extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
@@ -202,6 +205,11 @@ mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to)
202{ 205{
203} 206}
204 207
208static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
209{
210 return NULL;
211}
212
205static inline int mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *mem) 213static inline int mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *mem)
206{ 214{
207 return 1; 215 return 1;
@@ -213,6 +221,11 @@ static inline int task_in_mem_cgroup(struct task_struct *task,
213 return 1; 221 return 1;
214} 222}
215 223
224static inline struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
225{
226 return NULL;
227}
228
216static inline int 229static inline int
217mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) 230mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
218{ 231{
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9d65ae4ba0e0..849b4a61bd8f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1331,11 +1331,17 @@ extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
1331 size_t size); 1331 size_t size);
1332extern void refund_locked_memory(struct mm_struct *mm, size_t size); 1332extern void refund_locked_memory(struct mm_struct *mm, size_t size);
1333 1333
1334enum mf_flags {
1335 MF_COUNT_INCREASED = 1 << 0,
1336};
1334extern void memory_failure(unsigned long pfn, int trapno); 1337extern void memory_failure(unsigned long pfn, int trapno);
1335extern int __memory_failure(unsigned long pfn, int trapno, int ref); 1338extern int __memory_failure(unsigned long pfn, int trapno, int flags);
1339extern int unpoison_memory(unsigned long pfn);
1336extern int sysctl_memory_failure_early_kill; 1340extern int sysctl_memory_failure_early_kill;
1337extern int sysctl_memory_failure_recovery; 1341extern int sysctl_memory_failure_recovery;
1342extern void shake_page(struct page *p, int access);
1338extern atomic_long_t mce_bad_pages; 1343extern atomic_long_t mce_bad_pages;
1344extern int soft_offline_page(struct page *page, int flags);
1339 1345
1340#endif /* __KERNEL__ */ 1346#endif /* __KERNEL__ */
1341#endif /* _LINUX_MM_H */ 1347#endif /* _LINUX_MM_H */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 49e907bd067f..feee2ba8d06a 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -275,13 +275,15 @@ PAGEFLAG_FALSE(Uncached)
275 275
276#ifdef CONFIG_MEMORY_FAILURE 276#ifdef CONFIG_MEMORY_FAILURE
277PAGEFLAG(HWPoison, hwpoison) 277PAGEFLAG(HWPoison, hwpoison)
278TESTSETFLAG(HWPoison, hwpoison) 278TESTSCFLAG(HWPoison, hwpoison)
279#define __PG_HWPOISON (1UL << PG_hwpoison) 279#define __PG_HWPOISON (1UL << PG_hwpoison)
280#else 280#else
281PAGEFLAG_FALSE(HWPoison) 281PAGEFLAG_FALSE(HWPoison)
282#define __PG_HWPOISON 0 282#define __PG_HWPOISON 0
283#endif 283#endif
284 284
285u64 stable_page_flags(struct page *page);
286
285static inline int PageUptodate(struct page *page) 287static inline int PageUptodate(struct page *page)
286{ 288{
287 int ret = test_bit(PG_uptodate, &(page)->flags); 289 int ret = test_bit(PG_uptodate, &(page)->flags);
diff --git a/mm/Kconfig b/mm/Kconfig
index 2310984591ed..43ea8c3a2bbf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -251,8 +251,9 @@ config MEMORY_FAILURE
251 special hardware support and typically ECC memory. 251 special hardware support and typically ECC memory.
252 252
253config HWPOISON_INJECT 253config HWPOISON_INJECT
254 tristate "Poison pages injector" 254 tristate "HWPoison pages injector"
255 depends on MEMORY_FAILURE && DEBUG_KERNEL 255 depends on MEMORY_FAILURE && DEBUG_KERNEL
256 select PROC_PAGE_MONITOR
256 257
257config NOMMU_INITIAL_TRIM_EXCESS 258config NOMMU_INITIAL_TRIM_EXCESS
258 int "Turn on mmap() excess space trimming before booting" 259 int "Turn on mmap() excess space trimming before booting"
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index e1d85137f086..10ea71905c1f 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -3,18 +3,68 @@
3#include <linux/debugfs.h> 3#include <linux/debugfs.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/swap.h>
7#include <linux/pagemap.h>
8#include "internal.h"
6 9
7static struct dentry *hwpoison_dir, *corrupt_pfn; 10static struct dentry *hwpoison_dir;
8 11
9static int hwpoison_inject(void *data, u64 val) 12static int hwpoison_inject(void *data, u64 val)
10{ 13{
14 unsigned long pfn = val;
15 struct page *p;
16 int err;
17
18 if (!capable(CAP_SYS_ADMIN))
19 return -EPERM;
20
21 if (!hwpoison_filter_enable)
22 goto inject;
23 if (!pfn_valid(pfn))
24 return -ENXIO;
25
26 p = pfn_to_page(pfn);
27 /*
28 * This implies unable to support free buddy pages.
29 */
30 if (!get_page_unless_zero(p))
31 return 0;
32
33 if (!PageLRU(p))
34 shake_page(p, 0);
35 /*
36 * This implies unable to support non-LRU pages.
37 */
38 if (!PageLRU(p))
39 return 0;
40
41 /*
42 * do a racy check with elevated page count, to make sure PG_hwpoison
43 * will only be set for the targeted owner (or on a free page).
44 * We temporarily take page lock for try_get_mem_cgroup_from_page().
45 * __memory_failure() will redo the check reliably inside page lock.
46 */
47 lock_page(p);
48 err = hwpoison_filter(p);
49 unlock_page(p);
50 if (err)
51 return 0;
52
53inject:
54 printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
55 return __memory_failure(pfn, 18, MF_COUNT_INCREASED);
56}
57
58static int hwpoison_unpoison(void *data, u64 val)
59{
11 if (!capable(CAP_SYS_ADMIN)) 60 if (!capable(CAP_SYS_ADMIN))
12 return -EPERM; 61 return -EPERM;
13 printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); 62
14 return __memory_failure(val, 18, 0); 63 return unpoison_memory(val);
15} 64}
16 65
17DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); 66DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
67DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
18 68
19static void pfn_inject_exit(void) 69static void pfn_inject_exit(void)
20{ 70{
@@ -24,16 +74,63 @@ static void pfn_inject_exit(void)
24 74
25static int pfn_inject_init(void) 75static int pfn_inject_init(void)
26{ 76{
77 struct dentry *dentry;
78
27 hwpoison_dir = debugfs_create_dir("hwpoison", NULL); 79 hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
28 if (hwpoison_dir == NULL) 80 if (hwpoison_dir == NULL)
29 return -ENOMEM; 81 return -ENOMEM;
30 corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, 82
83 /*
84 * Note that the below poison/unpoison interfaces do not involve
85 * hardware status change, hence do not require hardware support.
 86 * They are mainly for testing hwpoison at the software level.
87 */
88 dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
31 NULL, &hwpoison_fops); 89 NULL, &hwpoison_fops);
32 if (corrupt_pfn == NULL) { 90 if (!dentry)
33 pfn_inject_exit(); 91 goto fail;
34 return -ENOMEM; 92
35 } 93 dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir,
94 NULL, &unpoison_fops);
95 if (!dentry)
96 goto fail;
97
98 dentry = debugfs_create_u32("corrupt-filter-enable", 0600,
99 hwpoison_dir, &hwpoison_filter_enable);
100 if (!dentry)
101 goto fail;
102
103 dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600,
104 hwpoison_dir, &hwpoison_filter_dev_major);
105 if (!dentry)
106 goto fail;
107
108 dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600,
109 hwpoison_dir, &hwpoison_filter_dev_minor);
110 if (!dentry)
111 goto fail;
112
113 dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600,
114 hwpoison_dir, &hwpoison_filter_flags_mask);
115 if (!dentry)
116 goto fail;
117
118 dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600,
119 hwpoison_dir, &hwpoison_filter_flags_value);
120 if (!dentry)
121 goto fail;
122
123#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
124 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
125 hwpoison_dir, &hwpoison_filter_memcg);
126 if (!dentry)
127 goto fail;
128#endif
129
36 return 0; 130 return 0;
131fail:
132 pfn_inject_exit();
133 return -ENOMEM;
37} 134}
38 135
39module_init(pfn_inject_init); 136module_init(pfn_inject_init);
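
A sketch of a poison/unpoison round trip through the two files registered
above (assuming debugfs at /debug and an illustrative PFN; per hwpoison.txt,
unpoison-pfn only reverses software injection, not real hardware failures):

	/* Sketch: software-poison a PFN, then reverse it via unpoison-pfn. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	static void echo_pfn(const char *path, unsigned long pfn)
	{
		char buf[32];
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror(path);
			return;
		}
		snprintf(buf, sizeof(buf), "%lu", pfn);
		if (write(fd, buf, strlen(buf)) < 0)
			perror(path);
		close(fd);
	}

	int main(void)
	{
		unsigned long pfn = 0x1000;	/* illustrative PFN */

		echo_pfn("/debug/hwpoison/corrupt-pfn", pfn);
		/* Only reverses software injection, per hwpoison.txt. */
		echo_pfn("/debug/hwpoison/unpoison-pfn", pfn);
		return 0;
	}
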
diff --git a/mm/internal.h b/mm/internal.h
index 4fe67a162cb4..6a697bb97fc5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -50,6 +50,9 @@ extern void putback_lru_page(struct page *page);
50 */ 50 */
51extern void __free_pages_bootmem(struct page *page, unsigned int order); 51extern void __free_pages_bootmem(struct page *page, unsigned int order);
52extern void prep_compound_page(struct page *page, unsigned long order); 52extern void prep_compound_page(struct page *page, unsigned long order);
53#ifdef CONFIG_MEMORY_FAILURE
54extern bool is_free_buddy_page(struct page *page);
55#endif
53 56
54 57
55/* 58/*
@@ -247,3 +250,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
247#define ZONE_RECLAIM_SOME 0 250#define ZONE_RECLAIM_SOME 0
248#define ZONE_RECLAIM_SUCCESS 1 251#define ZONE_RECLAIM_SUCCESS 1
249#endif 252#endif
253
254extern int hwpoison_filter(struct page *p);
255
256extern u32 hwpoison_filter_dev_major;
257extern u32 hwpoison_filter_dev_minor;
258extern u64 hwpoison_filter_flags_mask;
259extern u64 hwpoison_filter_flags_value;
260extern u64 hwpoison_filter_memcg;
261extern u32 hwpoison_filter_enable;
diff --git a/mm/madvise.c b/mm/madvise.c
index 35b1479b7c9d..319528b8db74 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -9,6 +9,7 @@
9#include <linux/pagemap.h> 9#include <linux/pagemap.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/page-isolation.h>
12#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/ksm.h> 15#include <linux/ksm.h>
@@ -222,7 +223,7 @@ static long madvise_remove(struct vm_area_struct *vma,
222/* 223/*
223 * Error injection support for memory error handling. 224 * Error injection support for memory error handling.
224 */ 225 */
225static int madvise_hwpoison(unsigned long start, unsigned long end) 226static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
226{ 227{
227 int ret = 0; 228 int ret = 0;
228 229
@@ -230,15 +231,21 @@ static int madvise_hwpoison(unsigned long start, unsigned long end)
230 return -EPERM; 231 return -EPERM;
231 for (; start < end; start += PAGE_SIZE) { 232 for (; start < end; start += PAGE_SIZE) {
232 struct page *p; 233 struct page *p;
233 int ret = get_user_pages(current, current->mm, start, 1, 234 int ret = get_user_pages_fast(start, 1, 0, &p);
234 0, 0, &p, NULL);
235 if (ret != 1) 235 if (ret != 1)
236 return ret; 236 return ret;
237 if (bhv == MADV_SOFT_OFFLINE) {
238 printk(KERN_INFO "Soft offlining page %lx at %lx\n",
239 page_to_pfn(p), start);
240 ret = soft_offline_page(p, MF_COUNT_INCREASED);
241 if (ret)
242 break;
243 continue;
244 }
237 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", 245 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
238 page_to_pfn(p), start); 246 page_to_pfn(p), start);
239 /* Ignore return value for now */ 247 /* Ignore return value for now */
240 __memory_failure(page_to_pfn(p), 0, 1); 248 __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
241 put_page(p);
242 } 249 }
243 return ret; 250 return ret;
244} 251}
@@ -335,8 +342,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
335 size_t len; 342 size_t len;
336 343
337#ifdef CONFIG_MEMORY_FAILURE 344#ifdef CONFIG_MEMORY_FAILURE
338 if (behavior == MADV_HWPOISON) 345 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
339 return madvise_hwpoison(start, start+len_in); 346 return madvise_hwpoison(behavior, start, start+len_in);
340#endif 347#endif
341 if (!madvise_behavior_valid(behavior)) 348 if (!madvise_behavior_valid(behavior))
342 return error; 349 return error;
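
The new madvise behaviors can be exercised with a short test program; a
sketch (needs root; the MADV_* values from asm-generic/mman-common.h are
repeated in case libc headers do not carry them yet):

	/* Sketch: poison one page of an anonymous mapping via MADV_HWPOISON. */
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#ifndef MADV_HWPOISON
	#define MADV_HWPOISON	  100	/* from asm-generic/mman-common.h */
	#endif
	#ifndef MADV_SOFT_OFFLINE
	#define MADV_SOFT_OFFLINE 101
	#endif

	int main(void)
	{
		long pagesize = sysconf(_SC_PAGESIZE);
		char *p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		memset(p, 0xaa, pagesize);	/* fault the page in first */
		/* MADV_SOFT_OFFLINE here would migrate instead of killing. */
		if (madvise(p, pagesize, MADV_HWPOISON))
			perror("madvise");
		return 0;
	}
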
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 878808c4fcbe..488b644e0e8e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -283,6 +283,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
283 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 283 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
284} 284}
285 285
286struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
287{
288 return &mem->css;
289}
290
286static struct mem_cgroup_per_zone * 291static struct mem_cgroup_per_zone *
287page_cgroup_zoneinfo(struct page_cgroup *pc) 292page_cgroup_zoneinfo(struct page_cgroup *pc)
288{ 293{
@@ -1536,25 +1541,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1536 return container_of(css, struct mem_cgroup, css); 1541 return container_of(css, struct mem_cgroup, css);
1537} 1542}
1538 1543
1539static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) 1544struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1540{ 1545{
1541 struct mem_cgroup *mem; 1546 struct mem_cgroup *mem = NULL;
1542 struct page_cgroup *pc; 1547 struct page_cgroup *pc;
1543 unsigned short id; 1548 unsigned short id;
1544 swp_entry_t ent; 1549 swp_entry_t ent;
1545 1550
1546 VM_BUG_ON(!PageLocked(page)); 1551 VM_BUG_ON(!PageLocked(page));
1547 1552
1548 if (!PageSwapCache(page))
1549 return NULL;
1550
1551 pc = lookup_page_cgroup(page); 1553 pc = lookup_page_cgroup(page);
1552 lock_page_cgroup(pc); 1554 lock_page_cgroup(pc);
1553 if (PageCgroupUsed(pc)) { 1555 if (PageCgroupUsed(pc)) {
1554 mem = pc->mem_cgroup; 1556 mem = pc->mem_cgroup;
1555 if (mem && !css_tryget(&mem->css)) 1557 if (mem && !css_tryget(&mem->css))
1556 mem = NULL; 1558 mem = NULL;
1557 } else { 1559 } else if (PageSwapCache(page)) {
1558 ent.val = page_private(page); 1560 ent.val = page_private(page);
1559 id = lookup_swap_cgroup(ent); 1561 id = lookup_swap_cgroup(ent);
1560 rcu_read_lock(); 1562 rcu_read_lock();
@@ -1874,7 +1876,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1874 */ 1876 */
1875 if (!PageSwapCache(page)) 1877 if (!PageSwapCache(page))
1876 goto charge_cur_mm; 1878 goto charge_cur_mm;
1877 mem = try_get_mem_cgroup_from_swapcache(page); 1879 mem = try_get_mem_cgroup_from_page(page);
1878 if (!mem) 1880 if (!mem)
1879 goto charge_cur_mm; 1881 goto charge_cur_mm;
1880 *ptr = mem; 1882 *ptr = mem;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 50d4f8d7024a..6a0466ed5bfd 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -34,12 +34,16 @@
34#include <linux/kernel.h> 34#include <linux/kernel.h>
35#include <linux/mm.h> 35#include <linux/mm.h>
36#include <linux/page-flags.h> 36#include <linux/page-flags.h>
37#include <linux/kernel-page-flags.h>
37#include <linux/sched.h> 38#include <linux/sched.h>
38#include <linux/ksm.h> 39#include <linux/ksm.h>
39#include <linux/rmap.h> 40#include <linux/rmap.h>
40#include <linux/pagemap.h> 41#include <linux/pagemap.h>
41#include <linux/swap.h> 42#include <linux/swap.h>
42#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
44#include <linux/migrate.h>
45#include <linux/page-isolation.h>
46#include <linux/suspend.h>
43#include "internal.h" 47#include "internal.h"
44 48
45int sysctl_memory_failure_early_kill __read_mostly = 0; 49int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -48,6 +52,120 @@ int sysctl_memory_failure_recovery __read_mostly = 1;
48 52
49atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); 53atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
50 54
55u32 hwpoison_filter_enable = 0;
56u32 hwpoison_filter_dev_major = ~0U;
57u32 hwpoison_filter_dev_minor = ~0U;
58u64 hwpoison_filter_flags_mask;
59u64 hwpoison_filter_flags_value;
60EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
61EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
62EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
63EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
64EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
65
66static int hwpoison_filter_dev(struct page *p)
67{
68 struct address_space *mapping;
69 dev_t dev;
70
71 if (hwpoison_filter_dev_major == ~0U &&
72 hwpoison_filter_dev_minor == ~0U)
73 return 0;
74
75 /*
 76 * page_mapping() does not accept slab pages
77 */
78 if (PageSlab(p))
79 return -EINVAL;
80
81 mapping = page_mapping(p);
82 if (mapping == NULL || mapping->host == NULL)
83 return -EINVAL;
84
85 dev = mapping->host->i_sb->s_dev;
86 if (hwpoison_filter_dev_major != ~0U &&
87 hwpoison_filter_dev_major != MAJOR(dev))
88 return -EINVAL;
89 if (hwpoison_filter_dev_minor != ~0U &&
90 hwpoison_filter_dev_minor != MINOR(dev))
91 return -EINVAL;
92
93 return 0;
94}
95
96static int hwpoison_filter_flags(struct page *p)
97{
98 if (!hwpoison_filter_flags_mask)
99 return 0;
100
101 if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
102 hwpoison_filter_flags_value)
103 return 0;
104 else
105 return -EINVAL;
106}
107
108/*
109 * This allows stress tests to limit test scope to a collection of tasks
110 * by putting them under some memcg. This prevents killing unrelated/important
111 * processes such as /sbin/init. Note that the target task may share clean
 112 * pages with init (e.g. libc text), which is harmless. If the target task
 113 * shares _dirty_ pages with another task B, the test scheme must make sure B
 114 * is also included in the memcg. Lastly, due to race conditions this filter
115 * can only guarantee that the page either belongs to the memcg tasks, or is
116 * a freed page.
117 */
118#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
119u64 hwpoison_filter_memcg;
120EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
121static int hwpoison_filter_task(struct page *p)
122{
123 struct mem_cgroup *mem;
124 struct cgroup_subsys_state *css;
125 unsigned long ino;
126
127 if (!hwpoison_filter_memcg)
128 return 0;
129
130 mem = try_get_mem_cgroup_from_page(p);
131 if (!mem)
132 return -EINVAL;
133
134 css = mem_cgroup_css(mem);
135 /* root_mem_cgroup has NULL dentries */
136 if (!css->cgroup->dentry)
137 return -EINVAL;
138
139 ino = css->cgroup->dentry->d_inode->i_ino;
140 css_put(css);
141
142 if (ino != hwpoison_filter_memcg)
143 return -EINVAL;
144
145 return 0;
146}
147#else
148static int hwpoison_filter_task(struct page *p) { return 0; }
149#endif
150
151int hwpoison_filter(struct page *p)
152{
153 if (!hwpoison_filter_enable)
154 return 0;
155
156 if (hwpoison_filter_dev(p))
157 return -EINVAL;
158
159 if (hwpoison_filter_flags(p))
160 return -EINVAL;
161
162 if (hwpoison_filter_task(p))
163 return -EINVAL;
164
165 return 0;
166}
167EXPORT_SYMBOL_GPL(hwpoison_filter);
168
51/* 169/*
52 * Send all the processes who have the page mapped an ``action optional'' 170 * Send all the processes who have the page mapped an ``action optional''
53 * signal. 171 * signal.
@@ -83,6 +201,36 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
83} 201}
84 202
85/* 203/*
 204 * When an unknown page type is encountered, drain as many buffers as possible
 205 * in the hope of turning the page into an LRU or free page, which we can handle.
206 */
207void shake_page(struct page *p, int access)
208{
209 if (!PageSlab(p)) {
210 lru_add_drain_all();
211 if (PageLRU(p))
212 return;
213 drain_all_pages();
214 if (PageLRU(p) || is_free_buddy_page(p))
215 return;
216 }
217
218 /*
 219 * Only call shrink_slab here (which would also
220 * shrink other caches) if access is not potentially fatal.
221 */
222 if (access) {
223 int nr;
224 do {
225 nr = shrink_slab(1000, GFP_KERNEL, 1000);
226 if (page_count(p) == 0)
227 break;
228 } while (nr > 10);
229 }
230}
231EXPORT_SYMBOL_GPL(shake_page);
232
233/*
86 * Kill all processes that have a poisoned page mapped and then isolate 234 * Kill all processes that have a poisoned page mapped and then isolate
87 * the page. 235 * the page.
88 * 236 *
@@ -177,7 +325,6 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
177 * In case something went wrong with munmapping 325 * In case something went wrong with munmapping
178 * make sure the process doesn't catch the 326 * make sure the process doesn't catch the
179 * signal and then access the memory. Just kill it. 327 * signal and then access the memory. Just kill it.
180 * the signal handlers
181 */ 328 */
182 if (fail || tk->addr_valid == 0) { 329 if (fail || tk->addr_valid == 0) {
183 printk(KERN_ERR 330 printk(KERN_ERR
@@ -314,33 +461,49 @@ static void collect_procs(struct page *page, struct list_head *tokill)
314 */ 461 */
315 462
316enum outcome { 463enum outcome {
317 FAILED, /* Error handling failed */ 464 IGNORED, /* Error: cannot be handled */
465 FAILED, /* Error: handling failed */
318 DELAYED, /* Will be handled later */ 466 DELAYED, /* Will be handled later */
319 IGNORED, /* Error safely ignored */
320 RECOVERED, /* Successfully recovered */ 467 RECOVERED, /* Successfully recovered */
321}; 468};
322 469
323static const char *action_name[] = { 470static const char *action_name[] = {
471 [IGNORED] = "Ignored",
324 [FAILED] = "Failed", 472 [FAILED] = "Failed",
325 [DELAYED] = "Delayed", 473 [DELAYED] = "Delayed",
326 [IGNORED] = "Ignored",
327 [RECOVERED] = "Recovered", 474 [RECOVERED] = "Recovered",
328}; 475};
329 476
330/* 477/*
331 * Error hit kernel page. 478 * XXX: It is possible that a page is isolated from LRU cache,
332 * Do nothing, try to be lucky and not touch this instead. For a few cases we 479 * and then kept in swap cache or failed to remove from page cache.
333 * could be more sophisticated. 480 * The page count will stop it from being freed by unpoison.
481 * Stress tests should be aware of this memory leak problem.
334 */ 482 */
335static int me_kernel(struct page *p, unsigned long pfn) 483static int delete_from_lru_cache(struct page *p)
336{ 484{
337 return DELAYED; 485 if (!isolate_lru_page(p)) {
486 /*
487 * Clear sensible page flags, so that the buddy system won't
488 * complain when the page is unpoison-and-freed.
489 */
490 ClearPageActive(p);
491 ClearPageUnevictable(p);
492 /*
493 * drop the page count elevated by isolate_lru_page()
494 */
495 page_cache_release(p);
496 return 0;
497 }
498 return -EIO;
338} 499}
339 500
340/* 501/*
341 * Already poisoned page. 502 * Error hit kernel page.
503 * Do nothing, try to be lucky and not touch this instead. For a few cases we
504 * could be more sophisticated.
342 */ 505 */
343static int me_ignore(struct page *p, unsigned long pfn) 506static int me_kernel(struct page *p, unsigned long pfn)
344{ 507{
345 return IGNORED; 508 return IGNORED;
346} 509}
@@ -355,14 +518,6 @@ static int me_unknown(struct page *p, unsigned long pfn)
355} 518}
356 519
357/* 520/*
358 * Free memory
359 */
360static int me_free(struct page *p, unsigned long pfn)
361{
362 return DELAYED;
363}
364
365/*
366 * Clean (or cleaned) page cache page. 521 * Clean (or cleaned) page cache page.
367 */ 522 */
368static int me_pagecache_clean(struct page *p, unsigned long pfn) 523static int me_pagecache_clean(struct page *p, unsigned long pfn)
@@ -371,6 +526,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
371 int ret = FAILED; 526 int ret = FAILED;
372 struct address_space *mapping; 527 struct address_space *mapping;
373 528
529 delete_from_lru_cache(p);
530
374 /* 531 /*
375 * For anonymous pages we're done the only reference left 532 * For anonymous pages we're done the only reference left
376 * should be the one m_f() holds. 533 * should be the one m_f() holds.
@@ -500,14 +657,20 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
500 /* Trigger EIO in shmem: */ 657 /* Trigger EIO in shmem: */
501 ClearPageUptodate(p); 658 ClearPageUptodate(p);
502 659
503 return DELAYED; 660 if (!delete_from_lru_cache(p))
661 return DELAYED;
662 else
663 return FAILED;
504} 664}
505 665
506static int me_swapcache_clean(struct page *p, unsigned long pfn) 666static int me_swapcache_clean(struct page *p, unsigned long pfn)
507{ 667{
508 delete_from_swap_cache(p); 668 delete_from_swap_cache(p);
509 669
510 return RECOVERED; 670 if (!delete_from_lru_cache(p))
671 return RECOVERED;
672 else
673 return FAILED;
511} 674}
512 675
513/* 676/*
@@ -550,7 +713,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
550#define tail (1UL << PG_tail) 713#define tail (1UL << PG_tail)
551#define compound (1UL << PG_compound) 714#define compound (1UL << PG_compound)
552#define slab (1UL << PG_slab) 715#define slab (1UL << PG_slab)
553#define buddy (1UL << PG_buddy)
554#define reserved (1UL << PG_reserved) 716#define reserved (1UL << PG_reserved)
555 717
556static struct page_state { 718static struct page_state {
@@ -559,8 +721,11 @@ static struct page_state {
559 char *msg; 721 char *msg;
560 int (*action)(struct page *p, unsigned long pfn); 722 int (*action)(struct page *p, unsigned long pfn);
561} error_states[] = { 723} error_states[] = {
562 { reserved, reserved, "reserved kernel", me_ignore }, 724 { reserved, reserved, "reserved kernel", me_kernel },
563 { buddy, buddy, "free kernel", me_free }, 725 /*
726 * free pages are specially detected outside this table:
727 * PG_buddy pages only make a small fraction of all free pages.
728 */
564 729
565 /* 730 /*
566 * Could in theory check if slab page is free or if we can drop 731 * Could in theory check if slab page is free or if we can drop
@@ -587,7 +752,6 @@ static struct page_state {
587 752
588 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, 753 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
589 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 754 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
590 { swapbacked, swapbacked, "anonymous", me_pagecache_clean },
591 755
592 /* 756 /*
593 * Catchall entry: must be at end. 757 * Catchall entry: must be at end.
@@ -595,20 +759,31 @@ static struct page_state {
595 { 0, 0, "unknown page state", me_unknown }, 759 { 0, 0, "unknown page state", me_unknown },
596}; 760};
597 761
762#undef dirty
763#undef sc
764#undef unevict
765#undef mlock
766#undef writeback
767#undef lru
768#undef swapbacked
769#undef head
770#undef tail
771#undef compound
772#undef slab
773#undef reserved
774
598static void action_result(unsigned long pfn, char *msg, int result) 775static void action_result(unsigned long pfn, char *msg, int result)
599{ 776{
600 struct page *page = NULL; 777 struct page *page = pfn_to_page(pfn);
601 if (pfn_valid(pfn))
602 page = pfn_to_page(pfn);
603 778
604 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", 779 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
605 pfn, 780 pfn,
606 page && PageDirty(page) ? "dirty " : "", 781 PageDirty(page) ? "dirty " : "",
607 msg, action_name[result]); 782 msg, action_name[result]);
608} 783}
609 784
610static int page_action(struct page_state *ps, struct page *p, 785static int page_action(struct page_state *ps, struct page *p,
611 unsigned long pfn, int ref) 786 unsigned long pfn)
612{ 787{
613 int result; 788 int result;
614 int count; 789 int count;
@@ -616,18 +791,22 @@ static int page_action(struct page_state *ps, struct page *p,
616 result = ps->action(p, pfn); 791 result = ps->action(p, pfn);
617 action_result(pfn, ps->msg, result); 792 action_result(pfn, ps->msg, result);
618 793
619 count = page_count(p) - 1 - ref; 794 count = page_count(p) - 1;
620 if (count != 0) 795 if (ps->action == me_swapcache_dirty && result == DELAYED)
796 count--;
797 if (count != 0) {
621 printk(KERN_ERR 798 printk(KERN_ERR
622 "MCE %#lx: %s page still referenced by %d users\n", 799 "MCE %#lx: %s page still referenced by %d users\n",
623 pfn, ps->msg, count); 800 pfn, ps->msg, count);
801 result = FAILED;
802 }
624 803
625 /* Could do more checks here if page looks ok */ 804 /* Could do more checks here if page looks ok */
626 /* 805 /*
627 * Could adjust zone counters here to correct for the missing page. 806 * Could adjust zone counters here to correct for the missing page.
628 */ 807 */
629 808
630 return result == RECOVERED ? 0 : -EBUSY; 809 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
631} 810}
632 811
633#define N_UNMAP_TRIES 5 812#define N_UNMAP_TRIES 5
@@ -636,7 +815,7 @@ static int page_action(struct page_state *ps, struct page *p,
636 * Do all that is necessary to remove user space mappings. Unmap 815 * Do all that is necessary to remove user space mappings. Unmap
637 * the pages and send SIGBUS to the processes if the data was dirty. 816 * the pages and send SIGBUS to the processes if the data was dirty.
638 */ 817 */
639static void hwpoison_user_mappings(struct page *p, unsigned long pfn, 818static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
640 int trapno) 819 int trapno)
641{ 820{
642 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 821 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
@@ -646,15 +825,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
646 int i; 825 int i;
647 int kill = 1; 826 int kill = 1;
648 827
649 if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) 828 if (PageReserved(p) || PageSlab(p))
650 return; 829 return SWAP_SUCCESS;
651 830
652 /* 831 /*
653 * This check implies we don't kill processes if their pages 832 * This check implies we don't kill processes if their pages
654 * are in the swap cache early. Those are always late kills. 833 * are in the swap cache early. Those are always late kills.
655 */ 834 */
656 if (!page_mapped(p)) 835 if (!page_mapped(p))
657 return; 836 return SWAP_SUCCESS;
837
838 if (PageCompound(p) || PageKsm(p))
839 return SWAP_FAIL;
658 840
659 if (PageSwapCache(p)) { 841 if (PageSwapCache(p)) {
660 printk(KERN_ERR 842 printk(KERN_ERR
@@ -665,6 +847,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
665 /* 847 /*
666 * Propagate the dirty bit from PTEs to struct page first, because we 848 * Propagate the dirty bit from PTEs to struct page first, because we
667 * need this to decide if we should kill or just drop the page. 849 * need this to decide if we should kill or just drop the page.
850 * XXX: the dirty test could be racy: set_page_dirty() may not always
851 * be called inside page lock (it's recommended but not enforced).
668 */ 852 */
669 mapping = page_mapping(p); 853 mapping = page_mapping(p);
670 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { 854 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
@@ -716,11 +900,12 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
716 */ 900 */
717 kill_procs_ao(&tokill, !!PageDirty(p), trapno, 901 kill_procs_ao(&tokill, !!PageDirty(p), trapno,
718 ret != SWAP_SUCCESS, pfn); 902 ret != SWAP_SUCCESS, pfn);
903
904 return ret;
719} 905}
720 906
721int __memory_failure(unsigned long pfn, int trapno, int ref) 907int __memory_failure(unsigned long pfn, int trapno, int flags)
722{ 908{
723 unsigned long lru_flag;
724 struct page_state *ps; 909 struct page_state *ps;
725 struct page *p; 910 struct page *p;
726 int res; 911 int res;
@@ -729,13 +914,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
729 panic("Memory failure from trap %d on page %lx", trapno, pfn); 914 panic("Memory failure from trap %d on page %lx", trapno, pfn);
730 915
731 if (!pfn_valid(pfn)) { 916 if (!pfn_valid(pfn)) {
732 action_result(pfn, "memory outside kernel control", IGNORED); 917 printk(KERN_ERR
733 return -EIO; 918 "MCE %#lx: memory outside kernel control\n",
919 pfn);
920 return -ENXIO;
734 } 921 }
735 922
736 p = pfn_to_page(pfn); 923 p = pfn_to_page(pfn);
737 if (TestSetPageHWPoison(p)) { 924 if (TestSetPageHWPoison(p)) {
738 action_result(pfn, "already hardware poisoned", IGNORED); 925 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
739 return 0; 926 return 0;
740 } 927 }
741 928
@@ -752,9 +939,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
752 * In fact it's dangerous to directly bump up page count from 0, 939 * In fact it's dangerous to directly bump up page count from 0,
753 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 940 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
754 */ 941 */
755 if (!get_page_unless_zero(compound_head(p))) { 942 if (!(flags & MF_COUNT_INCREASED) &&
756 action_result(pfn, "free or high order kernel", IGNORED); 943 !get_page_unless_zero(compound_head(p))) {
757 return PageBuddy(compound_head(p)) ? 0 : -EBUSY; 944 if (is_free_buddy_page(p)) {
945 action_result(pfn, "free buddy", DELAYED);
946 return 0;
947 } else {
948 action_result(pfn, "high order kernel", IGNORED);
949 return -EBUSY;
950 }
758 } 951 }
759 952
760 /* 953 /*
@@ -766,14 +959,19 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
766 * walked by the page reclaim code, however that's not a big loss. 959 * walked by the page reclaim code, however that's not a big loss.
767 */ 960 */
768 if (!PageLRU(p)) 961 if (!PageLRU(p))
769 lru_add_drain_all(); 962 shake_page(p, 0);
770 lru_flag = p->flags & lru; 963 if (!PageLRU(p)) {
771 if (isolate_lru_page(p)) { 964 /*
965 * shake_page could have turned it free.
966 */
967 if (is_free_buddy_page(p)) {
968 action_result(pfn, "free buddy, 2nd try", DELAYED);
969 return 0;
970 }
772 action_result(pfn, "non LRU", IGNORED); 971 action_result(pfn, "non LRU", IGNORED);
773 put_page(p); 972 put_page(p);
774 return -EBUSY; 973 return -EBUSY;
775 } 974 }
776 page_cache_release(p);
777 975
778 /* 976 /*
779 * Lock the page and wait for writeback to finish. 977 * Lock the page and wait for writeback to finish.
@@ -781,26 +979,48 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
781 * and in many cases impossible, so we just avoid it here. 979 * and in many cases impossible, so we just avoid it here.
782 */ 980 */
783 lock_page_nosync(p); 981 lock_page_nosync(p);
982
983 /*
 984 * unpoison always clears PG_hwpoison inside the page lock
985 */
986 if (!PageHWPoison(p)) {
987 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
988 res = 0;
989 goto out;
990 }
991 if (hwpoison_filter(p)) {
992 if (TestClearPageHWPoison(p))
993 atomic_long_dec(&mce_bad_pages);
994 unlock_page(p);
995 put_page(p);
996 return 0;
997 }
998
784 wait_on_page_writeback(p); 999 wait_on_page_writeback(p);
785 1000
786 /* 1001 /*
787 * Now take care of user space mappings. 1002 * Now take care of user space mappings.
1003 * Abort on fail: __remove_from_page_cache() assumes unmapped page.
788 */ 1004 */
789 hwpoison_user_mappings(p, pfn, trapno); 1005 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
1006 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1007 res = -EBUSY;
1008 goto out;
1009 }
790 1010
791 /* 1011 /*
792 * Torn down by someone else? 1012 * Torn down by someone else?
793 */ 1013 */
794 if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { 1014 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
795 action_result(pfn, "already truncated LRU", IGNORED); 1015 action_result(pfn, "already truncated LRU", IGNORED);
796 res = 0; 1016 res = -EBUSY;
797 goto out; 1017 goto out;
798 } 1018 }
799 1019
800 res = -EBUSY; 1020 res = -EBUSY;
801 for (ps = error_states;; ps++) { 1021 for (ps = error_states;; ps++) {
802 if (((p->flags | lru_flag)& ps->mask) == ps->res) { 1022 if ((p->flags & ps->mask) == ps->res) {
803 res = page_action(ps, p, pfn, ref); 1023 res = page_action(ps, p, pfn);
804 break; 1024 break;
805 } 1025 }
806 } 1026 }
@@ -831,3 +1051,235 @@ void memory_failure(unsigned long pfn, int trapno)
831{ 1051{
832 __memory_failure(pfn, trapno, 0); 1052 __memory_failure(pfn, trapno, 0);
833} 1053}
1054
1055/**
1056 * unpoison_memory - Unpoison a previously poisoned page
1057 * @pfn: Page number of the to be unpoisoned page
1058 *
1059 * Software-unpoison a page that has been poisoned by
1060 * memory_failure() earlier.
1061 *
 1062 * This is done only at the software level, so it only works
 1063 * for Linux-injected failures, not real hardware failures.
1064 *
1065 * Returns 0 for success, otherwise -errno.
1066 */
1067int unpoison_memory(unsigned long pfn)
1068{
1069 struct page *page;
1070 struct page *p;
1071 int freeit = 0;
1072
1073 if (!pfn_valid(pfn))
1074 return -ENXIO;
1075
1076 p = pfn_to_page(pfn);
1077 page = compound_head(p);
1078
1079 if (!PageHWPoison(p)) {
1080 pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
1081 return 0;
1082 }
1083
1084 if (!get_page_unless_zero(page)) {
1085 if (TestClearPageHWPoison(p))
1086 atomic_long_dec(&mce_bad_pages);
1087 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
1088 return 0;
1089 }
1090
1091 lock_page_nosync(page);
1092 /*
1093 * This test is racy because PG_hwpoison is set outside of page lock.
1094 * That's acceptable because that won't trigger kernel panic. Instead,
1095 * the PG_hwpoison page will be caught and isolated on the entrance to
1096 * the free buddy page pool.
1097 */
1098 if (TestClearPageHWPoison(p)) {
1099 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
1100 atomic_long_dec(&mce_bad_pages);
1101 freeit = 1;
1102 }
1103 unlock_page(page);
1104
1105 put_page(page);
1106 if (freeit)
1107 put_page(page);
1108
1109 return 0;
1110}
1111EXPORT_SYMBOL(unpoison_memory);
1112
1113static struct page *new_page(struct page *p, unsigned long private, int **x)
1114{
1115 int nid = page_to_nid(p);
1116 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1117}
1118
1119/*
1120 * Safely get reference count of an arbitrary page.
1121 * Returns 0 for a free page, -EIO for a zero refcount page
1122 * that is not free, and 1 for any other page type.
1123 * For 1 the page is returned with increased page count, otherwise not.
1124 */
1125static int get_any_page(struct page *p, unsigned long pfn, int flags)
1126{
1127 int ret;
1128
1129 if (flags & MF_COUNT_INCREASED)
1130 return 1;
1131
1132 /*
1133 * The lock_system_sleep prevents a race with memory hotplug,
1134 * because the isolation assumes there's only a single user.
 1135 * This is a big hammer; a better solution would be nicer.
1136 */
1137 lock_system_sleep();
1138
1139 /*
1140 * Isolate the page, so that it doesn't get reallocated if it
1141 * was free.
1142 */
1143 set_migratetype_isolate(p);
1144 if (!get_page_unless_zero(compound_head(p))) {
1145 if (is_free_buddy_page(p)) {
1146 pr_debug("get_any_page: %#lx free buddy page\n", pfn);
1147 /* Set hwpoison bit while page is still isolated */
1148 SetPageHWPoison(p);
1149 ret = 0;
1150 } else {
1151 pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1152 pfn, p->flags);
1153 ret = -EIO;
1154 }
1155 } else {
1156 /* Not a free page */
1157 ret = 1;
1158 }
1159 unset_migratetype_isolate(p);
1160 unlock_system_sleep();
1161 return ret;
1162}
1163
1164/**
1165 * soft_offline_page - Soft offline a page.
1166 * @page: page to offline
1167 * @flags: flags. Same as memory_failure().
1168 *
1169 * Returns 0 on success, otherwise negated errno.
1170 *
1171 * Soft offline a page, by migration or invalidation,
1172 * without killing anything. This is for the case when
1173 * a page is not corrupted yet (so it's still valid to access),
1174 * but has had a number of corrected errors and is better taken
1175 * out.
1176 *
1177 * The actual policy on when to do that is maintained by
1178 * user space.
1179 *
 1180 * This should never impact any application or cause data loss;
 1181 * however, it might take some time.
1182 *
1183 * This is not a 100% solution for all memory, but tries to be
1184 * ``good enough'' for the majority of memory.
1185 */
1186int soft_offline_page(struct page *page, int flags)
1187{
1188 int ret;
1189 unsigned long pfn = page_to_pfn(page);
1190
1191 ret = get_any_page(page, pfn, flags);
1192 if (ret < 0)
1193 return ret;
1194 if (ret == 0)
1195 goto done;
1196
1197 /*
1198 * Page cache page we can handle?
1199 */
1200 if (!PageLRU(page)) {
1201 /*
1202 * Try to free it.
1203 */
1204 put_page(page);
1205 shake_page(page, 1);
1206
1207 /*
1208 * Did it turn free?
1209 */
1210 ret = get_any_page(page, pfn, 0);
1211 if (ret < 0)
1212 return ret;
1213 if (ret == 0)
1214 goto done;
1215 }
1216 if (!PageLRU(page)) {
1217 pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
1218 pfn, page->flags);
1219 return -EIO;
1220 }
1221
1222 lock_page(page);
1223 wait_on_page_writeback(page);
1224
1225 /*
1226 * Synchronized using the page lock with memory_failure()
1227 */
1228 if (PageHWPoison(page)) {
1229 unlock_page(page);
1230 put_page(page);
1231 pr_debug("soft offline: %#lx page already poisoned\n", pfn);
1232 return -EBUSY;
1233 }
1234
1235 /*
1236 * Try to invalidate first. This should work for
1237 * non dirty unmapped page cache pages.
1238 */
1239 ret = invalidate_inode_page(page);
1240 unlock_page(page);
1241
1242 /*
1243 * Drop count because page migration doesn't like raised
1244 * counts. The page could get re-allocated, but if it becomes
1245 * LRU the isolation will just fail.
1246 * RED-PEN would be better to keep it isolated here, but we
1247 * would need to fix isolation locking first.
1248 */
1249 put_page(page);
1250 if (ret == 1) {
1251 ret = 0;
1252 pr_debug("soft_offline: %#lx: invalidated\n", pfn);
1253 goto done;
1254 }
1255
1256 /*
1257 * Simple invalidation didn't work.
1258 * Try to migrate to a new page instead. migrate.c
1259 * handles a large number of cases for us.
1260 */
1261 ret = isolate_lru_page(page);
1262 if (!ret) {
1263 LIST_HEAD(pagelist);
1264
1265 list_add(&page->lru, &pagelist);
1266 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1267 if (ret) {
1268 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1269 pfn, ret, page->flags);
1270 if (ret > 0)
1271 ret = -EIO;
1272 }
1273 } else {
1274 pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1275 pfn, ret, page_count(page), page->flags);
1276 }
1277 if (ret)
1278 return ret;
1279
1280done:
1281 atomic_long_add(1, &mce_bad_pages);
1282 SetPageHWPoison(page);
1283 /* keep elevated page count for bad page */
1284 return ret;
1285}
diff --git a/mm/memory.c b/mm/memory.c
index aed45eaf8ac9..09e4b1be7b67 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2555,6 +2555,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2555 ret = VM_FAULT_MAJOR; 2555 ret = VM_FAULT_MAJOR;
2556 count_vm_event(PGMAJFAULT); 2556 count_vm_event(PGMAJFAULT);
2557 } else if (PageHWPoison(page)) { 2557 } else if (PageHWPoison(page)) {
2558 /*
2559 * hwpoisoned dirty swapcache pages are kept for killing
2560 * owner processes (which may be unknown at hwpoison time)
2561 */
2558 ret = VM_FAULT_HWPOISON; 2562 ret = VM_FAULT_HWPOISON;
2559 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2563 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2560 goto out_release; 2564 goto out_release;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 850c4a7e2fe5..74af449b1f1d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5091,3 +5091,24 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5091 spin_unlock_irqrestore(&zone->lock, flags); 5091 spin_unlock_irqrestore(&zone->lock, flags);
5092} 5092}
5093#endif 5093#endif
5094
5095#ifdef CONFIG_MEMORY_FAILURE
5096bool is_free_buddy_page(struct page *page)
5097{
5098 struct zone *zone = page_zone(page);
5099 unsigned long pfn = page_to_pfn(page);
5100 unsigned long flags;
5101 int order;
5102
5103 spin_lock_irqsave(&zone->lock, flags);
5104 for (order = 0; order < MAX_ORDER; order++) {
5105 struct page *page_head = page - (pfn & ((1 << order) - 1));
5106
5107 if (PageBuddy(page_head) && page_order(page_head) >= order)
5108 break;
5109 }
5110 spin_unlock_irqrestore(&zone->lock, flags);
5111
5112 return order < MAX_ORDER;
5113}
5114#endif
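
The loop above searches each order for a buddy head by masking the low
`order` bits off the PFN; `page - (pfn & ((1 << order) - 1))` is the head
page at that alignment. A small user-space illustration of the arithmetic
(illustrative PFN, not part of the patch):

	/* Sketch: the order-alignment masking used by is_free_buddy_page(). */
	#include <stdio.h>

	int main(void)
	{
		unsigned long pfn = 0x12345;	/* illustrative PFN */
		int order;

		for (order = 0; order < 4; order++) {
			/* head = page - (pfn & ((1 << order) - 1)) */
			unsigned long head_pfn = pfn & ~((1UL << order) - 1);

			printf("order %d: candidate head pfn %#lx\n",
			       order, head_pfn);
		}
		return 0;
	}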