diff options
| -rw-r--r-- | Documentation/ABI/testing/sysfs-memory-page-offline | 44 | ||||
| -rw-r--r-- | Documentation/vm/hwpoison.txt | 52 | ||||
| -rw-r--r-- | Documentation/vm/page-types.c | 15 | ||||
| -rw-r--r-- | MAINTAINERS | 9 | ||||
| -rw-r--r-- | drivers/base/memory.c | 61 | ||||
| -rw-r--r-- | fs/proc/page.c | 45 | ||||
| -rw-r--r-- | include/asm-generic/mman-common.h | 1 | ||||
| -rw-r--r-- | include/linux/kernel-page-flags.h | 46 | ||||
| -rw-r--r-- | include/linux/memcontrol.h | 13 | ||||
| -rw-r--r-- | include/linux/mm.h | 8 | ||||
| -rw-r--r-- | include/linux/page-flags.h | 4 | ||||
| -rw-r--r-- | mm/Kconfig | 3 | ||||
| -rw-r--r-- | mm/hwpoison-inject.c | 113 | ||||
| -rw-r--r-- | mm/internal.h | 12 | ||||
| -rw-r--r-- | mm/madvise.c | 21 | ||||
| -rw-r--r-- | mm/memcontrol.c | 16 | ||||
| -rw-r--r-- | mm/memory-failure.c | 560 | ||||
| -rw-r--r-- | mm/memory.c | 4 | ||||
| -rw-r--r-- | mm/page_alloc.c | 21 |
19 files changed, 922 insertions, 126 deletions
diff --git a/Documentation/ABI/testing/sysfs-memory-page-offline b/Documentation/ABI/testing/sysfs-memory-page-offline new file mode 100644 index 000000000000..e14703f12fdf --- /dev/null +++ b/Documentation/ABI/testing/sysfs-memory-page-offline | |||
| @@ -0,0 +1,44 @@ | |||
| 1 | What: /sys/devices/system/memory/soft_offline_page | ||
| 2 | Date: Sep 2009 | ||
| 3 | KernelVersion: 2.6.33 | ||
| 4 | Contact: andi@firstfloor.org | ||
| 5 | Description: | ||
| 6 | Soft-offline the memory page containing the physical address | ||
| 7 | written into this file. Input is a hex number specifying the | ||
| 8 | physical address of the page. The kernel will then attempt | ||
| 9 | to soft-offline it, by moving the contents elsewhere or | ||
| 10 | dropping it if possible. The page will then be placed | ||
| 11 | on the bad page list and never be reused. | ||
| 12 | |||
| 13 | The offlining is done in kernel specific granularity. | ||
| 14 | Normally it's the base page size of the kernel, but | ||
| 15 | this might change. | ||
| 16 | |||
| 17 | The page must be still accessible, not poisoned. The | ||
| 18 | kernel will never kill anything for this, but rather | ||
| 19 | fail the offline. Return value is the size of the | ||
| 20 | number, or an error when the offlining failed. Reading | ||
| 21 | the file is not allowed. | ||
| 22 | |||
| 23 | What: /sys/devices/system/memory/hard_offline_page | ||
| 24 | Date: Sep 2009 | ||
| 25 | KernelVersion: 2.6.33 | ||
| 26 | Contact: andi@firstfloor.org | ||
| 27 | Description: | ||
| 28 | Hard-offline the memory page containing the physical | ||
| 29 | address written into this file. Input is a hex number | ||
| 30 | specifying the physical address of the page. The | ||
| 31 | kernel will then attempt to hard-offline the page, by | ||
| 32 | trying to drop the page or killing any owner or | ||
| 33 | triggering IO errors if needed. Note this may kill | ||
| 34 | any processes owning the page. The kernel will avoid | ||
| 35 | accessing this page, assuming it's poisoned by the | ||
| 36 | hardware. | ||
| 37 | |||
| 38 | The offlining is done in kernel specific granularity. | ||
| 39 | Normally it's the base page size of the kernel, but | ||
| 40 | this might change. | ||
| 41 | |||
| 42 | Return value is the size of the number, or an error when | ||
| 43 | the offlining failed. | ||
| 44 | Reading the file is not allowed. | ||
diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index 3ffadf8da61f..12f9ba20ccb7 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt | |||
| @@ -92,16 +92,62 @@ PR_MCE_KILL_GET | |||
| 92 | 92 | ||
| 93 | Testing: | 93 | Testing: |
| 94 | 94 | ||
| 95 | madvise(MADV_POISON, ....) | 95 | madvise(MADV_HWPOISON, ....) |
| 96 | (as root) | 96 | (as root) |
| 97 | Poison a page in the process for testing | 97 | Poison a page in the process for testing |
| 98 | 98 | ||
| 99 | 99 | ||
| 100 | hwpoison-inject module through debugfs | 100 | hwpoison-inject module through debugfs |
| 101 | /sys/debug/hwpoison/corrupt-pfn | ||
| 102 | 101 | ||
| 103 | Inject hwpoison fault at PFN echoed into this file | 102 | /sys/debug/hwpoison/ |
| 104 | 103 | ||
| 104 | corrupt-pfn | ||
| 105 | |||
| 106 | Inject hwpoison fault at PFN echoed into this file. This does | ||
| 107 | some early filtering to avoid corrupting unintended pages in test suites. | ||
| 108 | |||
| 109 | unpoison-pfn | ||
| 110 | |||
| 111 | Software-unpoison page at PFN echoed into this file. This | ||
| 112 | way a page can be reused again. | ||
| 113 | This only works for Linux injected failures, not for real | ||
| 114 | memory failures. | ||
| 115 | |||
| 116 | Note these injection interfaces are not stable and might change between | ||
| 117 | kernel versions. | ||
| 118 | |||
| 119 | corrupt-filter-dev-major | ||
| 120 | corrupt-filter-dev-minor | ||
| 121 | |||
| 122 | Only handle memory failures to pages associated with the file system defined | ||
| 123 | by block device major/minor. -1U is the wildcard value. | ||
| 124 | This should be only used for testing with artificial injection. | ||
| 125 | |||
| 126 | corrupt-filter-memcg | ||
| 127 | |||
| 128 | Limit injection to pages owned by a memory cgroup (memcg). Specified by | ||
| 129 | the inode number of the memcg. | ||
| 130 | |||
| 131 | Example: | ||
| 132 | mkdir /cgroup/hwpoison | ||
| 133 | |||
| 134 | usemem -m 100 -s 1000 & | ||
| 135 | echo `jobs -p` > /cgroup/hwpoison/tasks | ||
| 136 | |||
| 137 | memcg_ino=$(ls -id /cgroup/hwpoison | cut -f1 -d' ') | ||
| 138 | echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg | ||
| 139 | |||
| 140 | page-types -p `pidof init` --hwpoison # shall do nothing | ||
| 141 | page-types -p `pidof usemem` --hwpoison # poison its pages | ||
| 142 | |||
| 143 | corrupt-filter-flags-mask | ||
| 144 | corrupt-filter-flags-value | ||
| 145 | |||
| 146 | When specified, only poison pages if ((page_flags & mask) == value). | ||
| 147 | This allows stress testing of many kinds of pages. The page_flags | ||
| 148 | are the same as in /proc/kpageflags. The flag bits are defined in | ||
| 149 | include/linux/kernel-page-flags.h and documented in | ||
| 150 | Documentation/vm/pagemap.txt | ||
| 105 | 151 | ||
| 106 | Architecture specific MCE injector | 152 | Architecture specific MCE injector |
| 107 | 153 | ||
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index 7a7d9bab32ef..66e9358e2144 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c | |||
| @@ -1,11 +1,22 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * page-types: Tool for querying page flags | 2 | * page-types: Tool for querying page flags |
| 3 | * | 3 | * |
| 4 | * This program is free software; you can redistribute it and/or modify it | ||
| 5 | * under the terms of the GNU General Public License as published by the Free | ||
| 6 | * Software Foundation; version 2. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 9 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 10 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 11 | * more details. | ||
| 12 | * | ||
| 13 | * You should find a copy of v2 of the GNU General Public License somewhere on | ||
| 14 | * your Linux system; if not, write to the Free Software Foundation, Inc., 59 | ||
| 15 | * Temple Place, Suite 330, Boston, MA 02111-1307 USA. | ||
| 16 | * | ||
| 4 | * Copyright (C) 2009 Intel corporation | 17 | * Copyright (C) 2009 Intel corporation |
| 5 | * | 18 | * |
| 6 | * Authors: Wu Fengguang <fengguang.wu@intel.com> | 19 | * Authors: Wu Fengguang <fengguang.wu@intel.com> |
| 7 | * | ||
| 8 | * Released under the General Public License (GPL). | ||
| 9 | */ | 20 | */ |
| 10 | 21 | ||
| 11 | #define _LARGEFILE64_SOURCE | 22 | #define _LARGEFILE64_SOURCE |
diff --git a/MAINTAINERS b/MAINTAINERS index d6a27110a747..0699782f8c5b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
| @@ -2377,6 +2377,15 @@ W: http://www.kernel.org/pub/linux/kernel/people/fseidel/hdaps/ | |||
| 2377 | S: Maintained | 2377 | S: Maintained |
| 2378 | F: drivers/hwmon/hdaps.c | 2378 | F: drivers/hwmon/hdaps.c |
| 2379 | 2379 | ||
| 2380 | HWPOISON MEMORY FAILURE HANDLING | ||
| 2381 | M: Andi Kleen <andi@firstfloor.org> | ||
| 2382 | L: linux-mm@kvack.org | ||
| 2383 | L: linux-kernel@vger.kernel.org | ||
| 2384 | T: git git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6.git hwpoison | ||
| 2385 | S: Maintained | ||
| 2386 | F: mm/memory-failure.c | ||
| 2387 | F: mm/hwpoison-inject.c | ||
| 2388 | |||
| 2380 | HYPERVISOR VIRTUAL CONSOLE DRIVER | 2389 | HYPERVISOR VIRTUAL CONSOLE DRIVER |
| 2381 | L: linuxppc-dev@ozlabs.org | 2390 | L: linuxppc-dev@ozlabs.org |
| 2382 | S: Odd Fixes | 2391 | S: Odd Fixes |
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 989429cfed88..c4c8f2e1dd15 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c | |||
| @@ -341,6 +341,64 @@ static inline int memory_probe_init(void) | |||
| 341 | } | 341 | } |
| 342 | #endif | 342 | #endif |
| 343 | 343 | ||
| 344 | #ifdef CONFIG_MEMORY_FAILURE | ||
| 345 | /* | ||
| 346 | * Support for offlining pages of memory | ||
| 347 | */ | ||
| 348 | |||
| 349 | /* Soft offline a page */ | ||
| 350 | static ssize_t | ||
| 351 | store_soft_offline_page(struct class *class, const char *buf, size_t count) | ||
| 352 | { | ||
| 353 | int ret; | ||
| 354 | u64 pfn; | ||
| 355 | if (!capable(CAP_SYS_ADMIN)) | ||
| 356 | return -EPERM; | ||
| 357 | if (strict_strtoull(buf, 0, &pfn) < 0) | ||
| 358 | return -EINVAL; | ||
| 359 | pfn >>= PAGE_SHIFT; | ||
| 360 | if (!pfn_valid(pfn)) | ||
| 361 | return -ENXIO; | ||
| 362 | ret = soft_offline_page(pfn_to_page(pfn), 0); | ||
| 363 | return ret == 0 ? count : ret; | ||
| 364 | } | ||
| 365 | |||
| 366 | /* Forcibly offline a page, including killing processes. */ | ||
| 367 | static ssize_t | ||
| 368 | store_hard_offline_page(struct class *class, const char *buf, size_t count) | ||
| 369 | { | ||
| 370 | int ret; | ||
| 371 | u64 pfn; | ||
| 372 | if (!capable(CAP_SYS_ADMIN)) | ||
| 373 | return -EPERM; | ||
| 374 | if (strict_strtoull(buf, 0, &pfn) < 0) | ||
| 375 | return -EINVAL; | ||
| 376 | pfn >>= PAGE_SHIFT; | ||
| 377 | ret = __memory_failure(pfn, 0, 0); | ||
| 378 | return ret ? ret : count; | ||
| 379 | } | ||
| 380 | |||
| 381 | static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page); | ||
| 382 | static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page); | ||
| 383 | |||
| 384 | static __init int memory_fail_init(void) | ||
| 385 | { | ||
| 386 | int err; | ||
| 387 | |||
| 388 | err = sysfs_create_file(&memory_sysdev_class.kset.kobj, | ||
| 389 | &class_attr_soft_offline_page.attr); | ||
| 390 | if (!err) | ||
| 391 | err = sysfs_create_file(&memory_sysdev_class.kset.kobj, | ||
| 392 | &class_attr_hard_offline_page.attr); | ||
| 393 | return err; | ||
| 394 | } | ||
| 395 | #else | ||
| 396 | static inline int memory_fail_init(void) | ||
| 397 | { | ||
| 398 | return 0; | ||
| 399 | } | ||
| 400 | #endif | ||
| 401 | |||
| 344 | /* | 402 | /* |
| 345 | * Note that phys_device is optional. It is here to allow for | 403 | * Note that phys_device is optional. It is here to allow for |
| 346 | * differentiation between which *physical* devices each | 404 | * differentiation between which *physical* devices each |
| @@ -473,6 +531,9 @@ int __init memory_dev_init(void) | |||
| 473 | err = memory_probe_init(); | 531 | err = memory_probe_init(); |
| 474 | if (!ret) | 532 | if (!ret) |
| 475 | ret = err; | 533 | ret = err; |
| 534 | err = memory_fail_init(); | ||
| 535 | if (!ret) | ||
| 536 | ret = err; | ||
| 476 | err = block_size_init(); | 537 | err = block_size_init(); |
| 477 | if (!ret) | 538 | if (!ret) |
| 478 | ret = err; | 539 | ret = err; |
diff --git a/fs/proc/page.c b/fs/proc/page.c index 5033ce0d254b..180cf5a0bd67 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c | |||
| @@ -8,6 +8,7 @@ | |||
| 8 | #include <linux/proc_fs.h> | 8 | #include <linux/proc_fs.h> |
| 9 | #include <linux/seq_file.h> | 9 | #include <linux/seq_file.h> |
| 10 | #include <linux/hugetlb.h> | 10 | #include <linux/hugetlb.h> |
| 11 | #include <linux/kernel-page-flags.h> | ||
| 11 | #include <asm/uaccess.h> | 12 | #include <asm/uaccess.h> |
| 12 | #include "internal.h" | 13 | #include "internal.h" |
| 13 | 14 | ||
| @@ -71,52 +72,12 @@ static const struct file_operations proc_kpagecount_operations = { | |||
| 71 | * physical page flags. | 72 | * physical page flags. |
| 72 | */ | 73 | */ |
| 73 | 74 | ||
| 74 | /* These macros are used to decouple internal flags from exported ones */ | ||
| 75 | |||
| 76 | #define KPF_LOCKED 0 | ||
| 77 | #define KPF_ERROR 1 | ||
| 78 | #define KPF_REFERENCED 2 | ||
| 79 | #define KPF_UPTODATE 3 | ||
| 80 | #define KPF_DIRTY 4 | ||
| 81 | #define KPF_LRU 5 | ||
| 82 | #define KPF_ACTIVE 6 | ||
| 83 | #define KPF_SLAB 7 | ||
| 84 | #define KPF_WRITEBACK 8 | ||
| 85 | #define KPF_RECLAIM 9 | ||
| 86 | #define KPF_BUDDY 10 | ||
| 87 | |||
| 88 | /* 11-20: new additions in 2.6.31 */ | ||
| 89 | #define KPF_MMAP 11 | ||
| 90 | #define KPF_ANON 12 | ||
| 91 | #define KPF_SWAPCACHE 13 | ||
| 92 | #define KPF_SWAPBACKED 14 | ||
| 93 | #define KPF_COMPOUND_HEAD 15 | ||
| 94 | #define KPF_COMPOUND_TAIL 16 | ||
| 95 | #define KPF_HUGE 17 | ||
| 96 | #define KPF_UNEVICTABLE 18 | ||
| 97 | #define KPF_HWPOISON 19 | ||
| 98 | #define KPF_NOPAGE 20 | ||
| 99 | |||
| 100 | #define KPF_KSM 21 | ||
| 101 | |||
| 102 | /* kernel hacking assistances | ||
| 103 | * WARNING: subject to change, never rely on them! | ||
| 104 | */ | ||
| 105 | #define KPF_RESERVED 32 | ||
| 106 | #define KPF_MLOCKED 33 | ||
| 107 | #define KPF_MAPPEDTODISK 34 | ||
| 108 | #define KPF_PRIVATE 35 | ||
| 109 | #define KPF_PRIVATE_2 36 | ||
| 110 | #define KPF_OWNER_PRIVATE 37 | ||
| 111 | #define KPF_ARCH 38 | ||
| 112 | #define KPF_UNCACHED 39 | ||
| 113 | |||
| 114 | static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) | 75 | static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) |
| 115 | { | 76 | { |
| 116 | return ((kflags >> kbit) & 1) << ubit; | 77 | return ((kflags >> kbit) & 1) << ubit; |
| 117 | } | 78 | } |
| 118 | 79 | ||
| 119 | static u64 get_uflags(struct page *page) | 80 | u64 stable_page_flags(struct page *page) |
| 120 | { | 81 | { |
| 121 | u64 k; | 82 | u64 k; |
| 122 | u64 u; | 83 | u64 u; |
| @@ -219,7 +180,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, | |||
| 219 | else | 180 | else |
| 220 | ppage = NULL; | 181 | ppage = NULL; |
| 221 | 182 | ||
| 222 | if (put_user(get_uflags(ppage), out)) { | 183 | if (put_user(stable_page_flags(ppage), out)) { |
| 223 | ret = -EFAULT; | 184 | ret = -EFAULT; |
| 224 | break; | 185 | break; |
| 225 | } | 186 | } |
diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h index 20111265afd8..3da9e2742fa0 100644 --- a/include/asm-generic/mman-common.h +++ b/include/asm-generic/mman-common.h | |||
| @@ -40,6 +40,7 @@ | |||
| 40 | #define MADV_DONTFORK 10 /* don't inherit across fork */ | 40 | #define MADV_DONTFORK 10 /* don't inherit across fork */ |
| 41 | #define MADV_DOFORK 11 /* do inherit across fork */ | 41 | #define MADV_DOFORK 11 /* do inherit across fork */ |
| 42 | #define MADV_HWPOISON 100 /* poison a page for testing */ | 42 | #define MADV_HWPOISON 100 /* poison a page for testing */ |
| 43 | #define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ | ||
| 43 | 44 | ||
| 44 | #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ | 45 | #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ |
| 45 | #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ | 46 | #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ |
diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h new file mode 100644 index 000000000000..bd92a89f4b0a --- /dev/null +++ b/include/linux/kernel-page-flags.h | |||
| @@ -0,0 +1,46 @@ | |||
| 1 | #ifndef LINUX_KERNEL_PAGE_FLAGS_H | ||
| 2 | #define LINUX_KERNEL_PAGE_FLAGS_H | ||
| 3 | |||
| 4 | /* | ||
| 5 | * Stable page flag bits exported to user space | ||
| 6 | */ | ||
| 7 | |||
| 8 | #define KPF_LOCKED 0 | ||
| 9 | #define KPF_ERROR 1 | ||
| 10 | #define KPF_REFERENCED 2 | ||
| 11 | #define KPF_UPTODATE 3 | ||
| 12 | #define KPF_DIRTY 4 | ||
| 13 | #define KPF_LRU 5 | ||
| 14 | #define KPF_ACTIVE 6 | ||
| 15 | #define KPF_SLAB 7 | ||
| 16 | #define KPF_WRITEBACK 8 | ||
| 17 | #define KPF_RECLAIM 9 | ||
| 18 | #define KPF_BUDDY 10 | ||
| 19 | |||
| 20 | /* 11-20: new additions in 2.6.31 */ | ||
| 21 | #define KPF_MMAP 11 | ||
| 22 | #define KPF_ANON 12 | ||
| 23 | #define KPF_SWAPCACHE 13 | ||
| 24 | #define KPF_SWAPBACKED 14 | ||
| 25 | #define KPF_COMPOUND_HEAD 15 | ||
| 26 | #define KPF_COMPOUND_TAIL 16 | ||
| 27 | #define KPF_HUGE 17 | ||
| 28 | #define KPF_UNEVICTABLE 18 | ||
| 29 | #define KPF_HWPOISON 19 | ||
| 30 | #define KPF_NOPAGE 20 | ||
| 31 | |||
| 32 | #define KPF_KSM 21 | ||
| 33 | |||
| 34 | /* kernel hacking assistances | ||
| 35 | * WARNING: subject to change, never rely on them! | ||
| 36 | */ | ||
| 37 | #define KPF_RESERVED 32 | ||
| 38 | #define KPF_MLOCKED 33 | ||
| 39 | #define KPF_MAPPEDTODISK 34 | ||
| 40 | #define KPF_PRIVATE 35 | ||
| 41 | #define KPF_PRIVATE_2 36 | ||
| 42 | #define KPF_OWNER_PRIVATE 37 | ||
| 43 | #define KPF_ARCH 38 | ||
| 44 | #define KPF_UNCACHED 39 | ||
| 45 | |||
| 46 | #endif /* LINUX_KERNEL_PAGE_FLAGS_H */ | ||
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0b46c2068b96..1f9b119f4ace 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
| @@ -73,6 +73,7 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
| 73 | extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); | 73 | extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); |
| 74 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); | 74 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); |
| 75 | 75 | ||
| 76 | extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); | ||
| 76 | extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); | 77 | extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); |
| 77 | 78 | ||
| 78 | static inline | 79 | static inline |
| @@ -85,6 +86,8 @@ int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) | |||
| 85 | return cgroup == mem; | 86 | return cgroup == mem; |
| 86 | } | 87 | } |
| 87 | 88 | ||
| 89 | extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem); | ||
| 90 | |||
| 88 | extern int | 91 | extern int |
| 89 | mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr); | 92 | mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr); |
| 90 | extern void mem_cgroup_end_migration(struct mem_cgroup *mem, | 93 | extern void mem_cgroup_end_migration(struct mem_cgroup *mem, |
| @@ -202,6 +205,11 @@ mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to) | |||
| 202 | { | 205 | { |
| 203 | } | 206 | } |
| 204 | 207 | ||
| 208 | static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | ||
| 209 | { | ||
| 210 | return NULL; | ||
| 211 | } | ||
| 212 | |||
| 205 | static inline int mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *mem) | 213 | static inline int mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *mem) |
| 206 | { | 214 | { |
| 207 | return 1; | 215 | return 1; |
| @@ -213,6 +221,11 @@ static inline int task_in_mem_cgroup(struct task_struct *task, | |||
| 213 | return 1; | 221 | return 1; |
| 214 | } | 222 | } |
| 215 | 223 | ||
| 224 | static inline struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) | ||
| 225 | { | ||
| 226 | return NULL; | ||
| 227 | } | ||
| 228 | |||
| 216 | static inline int | 229 | static inline int |
| 217 | mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | 230 | mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) |
| 218 | { | 231 | { |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 9d65ae4ba0e0..849b4a61bd8f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
| @@ -1331,11 +1331,17 @@ extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, | |||
| 1331 | size_t size); | 1331 | size_t size); |
| 1332 | extern void refund_locked_memory(struct mm_struct *mm, size_t size); | 1332 | extern void refund_locked_memory(struct mm_struct *mm, size_t size); |
| 1333 | 1333 | ||
| 1334 | enum mf_flags { | ||
| 1335 | MF_COUNT_INCREASED = 1 << 0, | ||
| 1336 | }; | ||
| 1334 | extern void memory_failure(unsigned long pfn, int trapno); | 1337 | extern void memory_failure(unsigned long pfn, int trapno); |
| 1335 | extern int __memory_failure(unsigned long pfn, int trapno, int ref); | 1338 | extern int __memory_failure(unsigned long pfn, int trapno, int flags); |
| 1339 | extern int unpoison_memory(unsigned long pfn); | ||
| 1336 | extern int sysctl_memory_failure_early_kill; | 1340 | extern int sysctl_memory_failure_early_kill; |
| 1337 | extern int sysctl_memory_failure_recovery; | 1341 | extern int sysctl_memory_failure_recovery; |
| 1342 | extern void shake_page(struct page *p, int access); | ||
| 1338 | extern atomic_long_t mce_bad_pages; | 1343 | extern atomic_long_t mce_bad_pages; |
| 1344 | extern int soft_offline_page(struct page *page, int flags); | ||
| 1339 | 1345 | ||
| 1340 | #endif /* __KERNEL__ */ | 1346 | #endif /* __KERNEL__ */ |
| 1341 | #endif /* _LINUX_MM_H */ | 1347 | #endif /* _LINUX_MM_H */ |
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 49e907bd067f..feee2ba8d06a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h | |||
| @@ -275,13 +275,15 @@ PAGEFLAG_FALSE(Uncached) | |||
| 275 | 275 | ||
| 276 | #ifdef CONFIG_MEMORY_FAILURE | 276 | #ifdef CONFIG_MEMORY_FAILURE |
| 277 | PAGEFLAG(HWPoison, hwpoison) | 277 | PAGEFLAG(HWPoison, hwpoison) |
| 278 | TESTSETFLAG(HWPoison, hwpoison) | 278 | TESTSCFLAG(HWPoison, hwpoison) |
| 279 | #define __PG_HWPOISON (1UL << PG_hwpoison) | 279 | #define __PG_HWPOISON (1UL << PG_hwpoison) |
| 280 | #else | 280 | #else |
| 281 | PAGEFLAG_FALSE(HWPoison) | 281 | PAGEFLAG_FALSE(HWPoison) |
| 282 | #define __PG_HWPOISON 0 | 282 | #define __PG_HWPOISON 0 |
| 283 | #endif | 283 | #endif |
| 284 | 284 | ||
| 285 | u64 stable_page_flags(struct page *page); | ||
| 286 | |||
| 285 | static inline int PageUptodate(struct page *page) | 287 | static inline int PageUptodate(struct page *page) |
| 286 | { | 288 | { |
| 287 | int ret = test_bit(PG_uptodate, &(page)->flags); | 289 | int ret = test_bit(PG_uptodate, &(page)->flags); |
diff --git a/mm/Kconfig b/mm/Kconfig index 2310984591ed..43ea8c3a2bbf 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
| @@ -251,8 +251,9 @@ config MEMORY_FAILURE | |||
| 251 | special hardware support and typically ECC memory. | 251 | special hardware support and typically ECC memory. |
| 252 | 252 | ||
| 253 | config HWPOISON_INJECT | 253 | config HWPOISON_INJECT |
| 254 | tristate "Poison pages injector" | 254 | tristate "HWPoison pages injector" |
| 255 | depends on MEMORY_FAILURE && DEBUG_KERNEL | 255 | depends on MEMORY_FAILURE && DEBUG_KERNEL |
| 256 | select PROC_PAGE_MONITOR | ||
| 256 | 257 | ||
| 257 | config NOMMU_INITIAL_TRIM_EXCESS | 258 | config NOMMU_INITIAL_TRIM_EXCESS |
| 258 | int "Turn on mmap() excess space trimming before booting" | 259 | int "Turn on mmap() excess space trimming before booting" |
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index e1d85137f086..10ea71905c1f 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c | |||
| @@ -3,18 +3,68 @@ | |||
| 3 | #include <linux/debugfs.h> | 3 | #include <linux/debugfs.h> |
| 4 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
| 5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
| 6 | #include <linux/swap.h> | ||
| 7 | #include <linux/pagemap.h> | ||
| 8 | #include "internal.h" | ||
| 6 | 9 | ||
| 7 | static struct dentry *hwpoison_dir, *corrupt_pfn; | 10 | static struct dentry *hwpoison_dir; |
| 8 | 11 | ||
| 9 | static int hwpoison_inject(void *data, u64 val) | 12 | static int hwpoison_inject(void *data, u64 val) |
| 10 | { | 13 | { |
| 14 | unsigned long pfn = val; | ||
| 15 | struct page *p; | ||
| 16 | int err; | ||
| 17 | |||
| 18 | if (!capable(CAP_SYS_ADMIN)) | ||
| 19 | return -EPERM; | ||
| 20 | |||
| 21 | if (!hwpoison_filter_enable) | ||
| 22 | goto inject; | ||
| 23 | if (!pfn_valid(pfn)) | ||
| 24 | return -ENXIO; | ||
| 25 | |||
| 26 | p = pfn_to_page(pfn); | ||
| 27 | /* | ||
| 28 | * This implies unable to support free buddy pages. | ||
| 29 | */ | ||
| 30 | if (!get_page_unless_zero(p)) | ||
| 31 | return 0; | ||
| 32 | |||
| 33 | if (!PageLRU(p)) | ||
| 34 | shake_page(p, 0); | ||
| 35 | /* | ||
| 36 | * This implies unable to support non-LRU pages. | ||
| 37 | */ | ||
| 38 | if (!PageLRU(p)) | ||
| 39 | return 0; | ||
| 40 | |||
| 41 | /* | ||
| 42 | * do a racy check with elevated page count, to make sure PG_hwpoison | ||
| 43 | * will only be set for the targeted owner (or on a free page). | ||
| 44 | * We temporarily take page lock for try_get_mem_cgroup_from_page(). | ||
| 45 | * __memory_failure() will redo the check reliably inside page lock. | ||
| 46 | */ | ||
| 47 | lock_page(p); | ||
| 48 | err = hwpoison_filter(p); | ||
| 49 | unlock_page(p); | ||
| 50 | if (err) | ||
| 51 | return 0; | ||
| 52 | |||
| 53 | inject: | ||
| 54 | printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); | ||
| 55 | return __memory_failure(pfn, 18, MF_COUNT_INCREASED); | ||
| 56 | } | ||
| 57 | |||
| 58 | static int hwpoison_unpoison(void *data, u64 val) | ||
| 59 | { | ||
| 11 | if (!capable(CAP_SYS_ADMIN)) | 60 | if (!capable(CAP_SYS_ADMIN)) |
| 12 | return -EPERM; | 61 | return -EPERM; |
| 13 | printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); | 62 | |
| 14 | return __memory_failure(val, 18, 0); | 63 | return unpoison_memory(val); |
| 15 | } | 64 | } |
| 16 | 65 | ||
| 17 | DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); | 66 | DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); |
| 67 | DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); | ||
| 18 | 68 | ||
| 19 | static void pfn_inject_exit(void) | 69 | static void pfn_inject_exit(void) |
| 20 | { | 70 | { |
| @@ -24,16 +74,63 @@ static void pfn_inject_exit(void) | |||
| 24 | 74 | ||
| 25 | static int pfn_inject_init(void) | 75 | static int pfn_inject_init(void) |
| 26 | { | 76 | { |
| 77 | struct dentry *dentry; | ||
| 78 | |||
| 27 | hwpoison_dir = debugfs_create_dir("hwpoison", NULL); | 79 | hwpoison_dir = debugfs_create_dir("hwpoison", NULL); |
| 28 | if (hwpoison_dir == NULL) | 80 | if (hwpoison_dir == NULL) |
| 29 | return -ENOMEM; | 81 | return -ENOMEM; |
| 30 | corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, | 82 | |
| 83 | /* | ||
| 84 | * Note that the below poison/unpoison interfaces do not involve | ||
| 85 | * hardware status change, hence do not require hardware support. | ||
| 86 | * They are mainly for testing hwpoison in software level. | ||
| 87 | */ | ||
| 88 | dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, | ||
| 31 | NULL, &hwpoison_fops); | 89 | NULL, &hwpoison_fops); |
| 32 | if (corrupt_pfn == NULL) { | 90 | if (!dentry) |
| 33 | pfn_inject_exit(); | 91 | goto fail; |
| 34 | return -ENOMEM; | 92 | |
| 35 | } | 93 | dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, |
| 94 | NULL, &unpoison_fops); | ||
| 95 | if (!dentry) | ||
| 96 | goto fail; | ||
| 97 | |||
| 98 | dentry = debugfs_create_u32("corrupt-filter-enable", 0600, | ||
| 99 | hwpoison_dir, &hwpoison_filter_enable); | ||
| 100 | if (!dentry) | ||
| 101 | goto fail; | ||
| 102 | |||
| 103 | dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600, | ||
| 104 | hwpoison_dir, &hwpoison_filter_dev_major); | ||
| 105 | if (!dentry) | ||
| 106 | goto fail; | ||
| 107 | |||
| 108 | dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600, | ||
| 109 | hwpoison_dir, &hwpoison_filter_dev_minor); | ||
| 110 | if (!dentry) | ||
| 111 | goto fail; | ||
| 112 | |||
| 113 | dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600, | ||
| 114 | hwpoison_dir, &hwpoison_filter_flags_mask); | ||
| 115 | if (!dentry) | ||
| 116 | goto fail; | ||
| 117 | |||
| 118 | dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600, | ||
| 119 | hwpoison_dir, &hwpoison_filter_flags_value); | ||
| 120 | if (!dentry) | ||
| 121 | goto fail; | ||
| 122 | |||
| 123 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
| 124 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, | ||
| 125 | hwpoison_dir, &hwpoison_filter_memcg); | ||
| 126 | if (!dentry) | ||
| 127 | goto fail; | ||
| 128 | #endif | ||
| 129 | |||
| 36 | return 0; | 130 | return 0; |
| 131 | fail: | ||
| 132 | pfn_inject_exit(); | ||
| 133 | return -ENOMEM; | ||
| 37 | } | 134 | } |
| 38 | 135 | ||
| 39 | module_init(pfn_inject_init); | 136 | module_init(pfn_inject_init); |
diff --git a/mm/internal.h b/mm/internal.h index 4fe67a162cb4..6a697bb97fc5 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -50,6 +50,9 @@ extern void putback_lru_page(struct page *page); | |||
| 50 | */ | 50 | */ |
| 51 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 51 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
| 52 | extern void prep_compound_page(struct page *page, unsigned long order); | 52 | extern void prep_compound_page(struct page *page, unsigned long order); |
| 53 | #ifdef CONFIG_MEMORY_FAILURE | ||
| 54 | extern bool is_free_buddy_page(struct page *page); | ||
| 55 | #endif | ||
| 53 | 56 | ||
| 54 | 57 | ||
| 55 | /* | 58 | /* |
| @@ -247,3 +250,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 247 | #define ZONE_RECLAIM_SOME 0 | 250 | #define ZONE_RECLAIM_SOME 0 |
| 248 | #define ZONE_RECLAIM_SUCCESS 1 | 251 | #define ZONE_RECLAIM_SUCCESS 1 |
| 249 | #endif | 252 | #endif |
| 253 | |||
| 254 | extern int hwpoison_filter(struct page *p); | ||
| 255 | |||
| 256 | extern u32 hwpoison_filter_dev_major; | ||
| 257 | extern u32 hwpoison_filter_dev_minor; | ||
| 258 | extern u64 hwpoison_filter_flags_mask; | ||
| 259 | extern u64 hwpoison_filter_flags_value; | ||
| 260 | extern u64 hwpoison_filter_memcg; | ||
| 261 | extern u32 hwpoison_filter_enable; | ||
diff --git a/mm/madvise.c b/mm/madvise.c index 35b1479b7c9d..319528b8db74 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | #include <linux/pagemap.h> | 9 | #include <linux/pagemap.h> |
| 10 | #include <linux/syscalls.h> | 10 | #include <linux/syscalls.h> |
| 11 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
| 12 | #include <linux/page-isolation.h> | ||
| 12 | #include <linux/hugetlb.h> | 13 | #include <linux/hugetlb.h> |
| 13 | #include <linux/sched.h> | 14 | #include <linux/sched.h> |
| 14 | #include <linux/ksm.h> | 15 | #include <linux/ksm.h> |
| @@ -222,7 +223,7 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
| 222 | /* | 223 | /* |
| 223 | * Error injection support for memory error handling. | 224 | * Error injection support for memory error handling. |
| 224 | */ | 225 | */ |
| 225 | static int madvise_hwpoison(unsigned long start, unsigned long end) | 226 | static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) |
| 226 | { | 227 | { |
| 227 | int ret = 0; | 228 | int ret = 0; |
| 228 | 229 | ||
| @@ -230,15 +231,21 @@ static int madvise_hwpoison(unsigned long start, unsigned long end) | |||
| 230 | return -EPERM; | 231 | return -EPERM; |
| 231 | for (; start < end; start += PAGE_SIZE) { | 232 | for (; start < end; start += PAGE_SIZE) { |
| 232 | struct page *p; | 233 | struct page *p; |
| 233 | int ret = get_user_pages(current, current->mm, start, 1, | 234 | int ret = get_user_pages_fast(start, 1, 0, &p); |
| 234 | 0, 0, &p, NULL); | ||
| 235 | if (ret != 1) | 235 | if (ret != 1) |
| 236 | return ret; | 236 | return ret; |
| 237 | if (bhv == MADV_SOFT_OFFLINE) { | ||
| 238 | printk(KERN_INFO "Soft offlining page %lx at %lx\n", | ||
| 239 | page_to_pfn(p), start); | ||
| 240 | ret = soft_offline_page(p, MF_COUNT_INCREASED); | ||
| 241 | if (ret) | ||
| 242 | break; | ||
| 243 | continue; | ||
| 244 | } | ||
| 237 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", | 245 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", |
| 238 | page_to_pfn(p), start); | 246 | page_to_pfn(p), start); |
| 239 | /* Ignore return value for now */ | 247 | /* Ignore return value for now */ |
| 240 | __memory_failure(page_to_pfn(p), 0, 1); | 248 | __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); |
| 241 | put_page(p); | ||
| 242 | } | 249 | } |
| 243 | return ret; | 250 | return ret; |
| 244 | } | 251 | } |
| @@ -335,8 +342,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
| 335 | size_t len; | 342 | size_t len; |
| 336 | 343 | ||
| 337 | #ifdef CONFIG_MEMORY_FAILURE | 344 | #ifdef CONFIG_MEMORY_FAILURE |
| 338 | if (behavior == MADV_HWPOISON) | 345 | if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) |
| 339 | return madvise_hwpoison(start, start+len_in); | 346 | return madvise_hwpoison(behavior, start, start+len_in); |
| 340 | #endif | 347 | #endif |
| 341 | if (!madvise_behavior_valid(behavior)) | 348 | if (!madvise_behavior_valid(behavior)) |
| 342 | return error; | 349 | return error; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 878808c4fcbe..488b644e0e8e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -283,6 +283,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | |||
| 283 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | 283 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; |
| 284 | } | 284 | } |
| 285 | 285 | ||
| 286 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) | ||
| 287 | { | ||
| 288 | return &mem->css; | ||
| 289 | } | ||
| 290 | |||
| 286 | static struct mem_cgroup_per_zone * | 291 | static struct mem_cgroup_per_zone * |
| 287 | page_cgroup_zoneinfo(struct page_cgroup *pc) | 292 | page_cgroup_zoneinfo(struct page_cgroup *pc) |
| 288 | { | 293 | { |
| @@ -1536,25 +1541,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
| 1536 | return container_of(css, struct mem_cgroup, css); | 1541 | return container_of(css, struct mem_cgroup, css); |
| 1537 | } | 1542 | } |
| 1538 | 1543 | ||
| 1539 | static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) | 1544 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
| 1540 | { | 1545 | { |
| 1541 | struct mem_cgroup *mem; | 1546 | struct mem_cgroup *mem = NULL; |
| 1542 | struct page_cgroup *pc; | 1547 | struct page_cgroup *pc; |
| 1543 | unsigned short id; | 1548 | unsigned short id; |
| 1544 | swp_entry_t ent; | 1549 | swp_entry_t ent; |
| 1545 | 1550 | ||
| 1546 | VM_BUG_ON(!PageLocked(page)); | 1551 | VM_BUG_ON(!PageLocked(page)); |
| 1547 | 1552 | ||
| 1548 | if (!PageSwapCache(page)) | ||
| 1549 | return NULL; | ||
| 1550 | |||
| 1551 | pc = lookup_page_cgroup(page); | 1553 | pc = lookup_page_cgroup(page); |
| 1552 | lock_page_cgroup(pc); | 1554 | lock_page_cgroup(pc); |
| 1553 | if (PageCgroupUsed(pc)) { | 1555 | if (PageCgroupUsed(pc)) { |
| 1554 | mem = pc->mem_cgroup; | 1556 | mem = pc->mem_cgroup; |
| 1555 | if (mem && !css_tryget(&mem->css)) | 1557 | if (mem && !css_tryget(&mem->css)) |
| 1556 | mem = NULL; | 1558 | mem = NULL; |
| 1557 | } else { | 1559 | } else if (PageSwapCache(page)) { |
| 1558 | ent.val = page_private(page); | 1560 | ent.val = page_private(page); |
| 1559 | id = lookup_swap_cgroup(ent); | 1561 | id = lookup_swap_cgroup(ent); |
| 1560 | rcu_read_lock(); | 1562 | rcu_read_lock(); |
| @@ -1874,7 +1876,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
| 1874 | */ | 1876 | */ |
| 1875 | if (!PageSwapCache(page)) | 1877 | if (!PageSwapCache(page)) |
| 1876 | goto charge_cur_mm; | 1878 | goto charge_cur_mm; |
| 1877 | mem = try_get_mem_cgroup_from_swapcache(page); | 1879 | mem = try_get_mem_cgroup_from_page(page); |
| 1878 | if (!mem) | 1880 | if (!mem) |
| 1879 | goto charge_cur_mm; | 1881 | goto charge_cur_mm; |
| 1880 | *ptr = mem; | 1882 | *ptr = mem; |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 50d4f8d7024a..6a0466ed5bfd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
| @@ -34,12 +34,16 @@ | |||
| 34 | #include <linux/kernel.h> | 34 | #include <linux/kernel.h> |
| 35 | #include <linux/mm.h> | 35 | #include <linux/mm.h> |
| 36 | #include <linux/page-flags.h> | 36 | #include <linux/page-flags.h> |
| 37 | #include <linux/kernel-page-flags.h> | ||
| 37 | #include <linux/sched.h> | 38 | #include <linux/sched.h> |
| 38 | #include <linux/ksm.h> | 39 | #include <linux/ksm.h> |
| 39 | #include <linux/rmap.h> | 40 | #include <linux/rmap.h> |
| 40 | #include <linux/pagemap.h> | 41 | #include <linux/pagemap.h> |
| 41 | #include <linux/swap.h> | 42 | #include <linux/swap.h> |
| 42 | #include <linux/backing-dev.h> | 43 | #include <linux/backing-dev.h> |
| 44 | #include <linux/migrate.h> | ||
| 45 | #include <linux/page-isolation.h> | ||
| 46 | #include <linux/suspend.h> | ||
| 43 | #include "internal.h" | 47 | #include "internal.h" |
| 44 | 48 | ||
| 45 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 49 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
| @@ -48,6 +52,120 @@ int sysctl_memory_failure_recovery __read_mostly = 1; | |||
| 48 | 52 | ||
| 49 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); | 53 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); |
| 50 | 54 | ||
| 55 | u32 hwpoison_filter_enable = 0; | ||
| 56 | u32 hwpoison_filter_dev_major = ~0U; | ||
| 57 | u32 hwpoison_filter_dev_minor = ~0U; | ||
| 58 | u64 hwpoison_filter_flags_mask; | ||
| 59 | u64 hwpoison_filter_flags_value; | ||
| 60 | EXPORT_SYMBOL_GPL(hwpoison_filter_enable); | ||
| 61 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); | ||
| 62 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); | ||
| 63 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); | ||
| 64 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); | ||
| 65 | |||
| 66 | static int hwpoison_filter_dev(struct page *p) | ||
| 67 | { | ||
| 68 | struct address_space *mapping; | ||
| 69 | dev_t dev; | ||
| 70 | |||
| 71 | if (hwpoison_filter_dev_major == ~0U && | ||
| 72 | hwpoison_filter_dev_minor == ~0U) | ||
| 73 | return 0; | ||
| 74 | |||
| 75 | /* | ||
| 76 | * page_mapping() does not accept slab page | ||
| 77 | */ | ||
| 78 | if (PageSlab(p)) | ||
| 79 | return -EINVAL; | ||
| 80 | |||
| 81 | mapping = page_mapping(p); | ||
| 82 | if (mapping == NULL || mapping->host == NULL) | ||
| 83 | return -EINVAL; | ||
| 84 | |||
| 85 | dev = mapping->host->i_sb->s_dev; | ||
| 86 | if (hwpoison_filter_dev_major != ~0U && | ||
| 87 | hwpoison_filter_dev_major != MAJOR(dev)) | ||
| 88 | return -EINVAL; | ||
| 89 | if (hwpoison_filter_dev_minor != ~0U && | ||
| 90 | hwpoison_filter_dev_minor != MINOR(dev)) | ||
| 91 | return -EINVAL; | ||
| 92 | |||
| 93 | return 0; | ||
| 94 | } | ||
| 95 | |||
| 96 | static int hwpoison_filter_flags(struct page *p) | ||
| 97 | { | ||
| 98 | if (!hwpoison_filter_flags_mask) | ||
| 99 | return 0; | ||
| 100 | |||
| 101 | if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == | ||
| 102 | hwpoison_filter_flags_value) | ||
| 103 | return 0; | ||
| 104 | else | ||
| 105 | return -EINVAL; | ||
| 106 | } | ||
| 107 | |||
| 108 | /* | ||
| 109 | * This allows stress tests to limit test scope to a collection of tasks | ||
| 110 | * by putting them under some memcg. This prevents killing unrelated/important | ||
| 111 | * processes such as /sbin/init. Note that the target task may share clean | ||
| 112 | * pages with init (eg. libc text), which is harmless. If the target task | ||
| 113 | * shares _dirty_ pages with another task B, the test scheme must make sure B | ||
| 114 | * is also included in the memcg. Finally, due to race conditions this filter | ||
| 115 | * can only guarantee that the page either belongs to the memcg tasks, or is | ||
| 116 | * a freed page. | ||
| 117 | */ | ||
| 118 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
| 119 | u64 hwpoison_filter_memcg; | ||
| 120 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); | ||
| 121 | static int hwpoison_filter_task(struct page *p) | ||
| 122 | { | ||
| 123 | struct mem_cgroup *mem; | ||
| 124 | struct cgroup_subsys_state *css; | ||
| 125 | unsigned long ino; | ||
| 126 | |||
| 127 | if (!hwpoison_filter_memcg) | ||
| 128 | return 0; | ||
| 129 | |||
| 130 | mem = try_get_mem_cgroup_from_page(p); | ||
| 131 | if (!mem) | ||
| 132 | return -EINVAL; | ||
| 133 | |||
| 134 | css = mem_cgroup_css(mem); | ||
| 135 | /* root_mem_cgroup has NULL dentries */ | ||
| 136 | if (!css->cgroup->dentry) | ||
| 137 | return -EINVAL; | ||
| 138 | |||
| 139 | ino = css->cgroup->dentry->d_inode->i_ino; | ||
| 140 | css_put(css); | ||
| 141 | |||
| 142 | if (ino != hwpoison_filter_memcg) | ||
| 143 | return -EINVAL; | ||
| 144 | |||
| 145 | return 0; | ||
| 146 | } | ||
| 147 | #else | ||
| 148 | static int hwpoison_filter_task(struct page *p) { return 0; } | ||
| 149 | #endif | ||
| 150 | |||
| 151 | int hwpoison_filter(struct page *p) | ||
| 152 | { | ||
| 153 | if (!hwpoison_filter_enable) | ||
| 154 | return 0; | ||
| 155 | |||
| 156 | if (hwpoison_filter_dev(p)) | ||
| 157 | return -EINVAL; | ||
| 158 | |||
| 159 | if (hwpoison_filter_flags(p)) | ||
| 160 | return -EINVAL; | ||
| 161 | |||
| 162 | if (hwpoison_filter_task(p)) | ||
| 163 | return -EINVAL; | ||
| 164 | |||
| 165 | return 0; | ||
| 166 | } | ||
| 167 | EXPORT_SYMBOL_GPL(hwpoison_filter); | ||
| 168 | |||
| 51 | /* | 169 | /* |
| 52 | * Send all the processes who have the page mapped an ``action optional'' | 170 | * Send all the processes who have the page mapped an ``action optional'' |
| 53 | * signal. | 171 | * signal. |
| @@ -83,6 +201,36 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, | |||
| 83 | } | 201 | } |
| 84 | 202 | ||
| 85 | /* | 203 | /* |
| 204 | * When an unknown page type is encountered drain as many buffers as possible | ||
| 205 | * in the hope to turn the page into a LRU or free page, which we can handle. | ||
| 206 | */ | ||
| 207 | void shake_page(struct page *p, int access) | ||
| 208 | { | ||
| 209 | if (!PageSlab(p)) { | ||
| 210 | lru_add_drain_all(); | ||
| 211 | if (PageLRU(p)) | ||
| 212 | return; | ||
| 213 | drain_all_pages(); | ||
| 214 | if (PageLRU(p) || is_free_buddy_page(p)) | ||
| 215 | return; | ||
| 216 | } | ||
| 217 | |||
| 218 | /* | ||
| 219 | * Only call shrink_slab here (which would also | ||
| 220 | * shrink other caches) if access is not potentially fatal. | ||
| 221 | */ | ||
| 222 | if (access) { | ||
| 223 | int nr; | ||
| 224 | do { | ||
| 225 | nr = shrink_slab(1000, GFP_KERNEL, 1000); | ||
| 226 | if (page_count(p) == 0) | ||
| 227 | break; | ||
| 228 | } while (nr > 10); | ||
| 229 | } | ||
| 230 | } | ||
| 231 | EXPORT_SYMBOL_GPL(shake_page); | ||
| 232 | |||
| 233 | /* | ||
| 86 | * Kill all processes that have a poisoned page mapped and then isolate | 234 | * Kill all processes that have a poisoned page mapped and then isolate |
| 87 | * the page. | 235 | * the page. |
| 88 | * | 236 | * |
| @@ -177,7 +325,6 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, | |||
| 177 | * In case something went wrong with munmapping | 325 | * In case something went wrong with munmapping |
| 178 | * make sure the process doesn't catch the | 326 | * make sure the process doesn't catch the |
| 179 | * signal and then access the memory. Just kill it. | 327 | * signal and then access the memory. Just kill it. |
| 180 | * the signal handlers | ||
| 181 | */ | 328 | */ |
| 182 | if (fail || tk->addr_valid == 0) { | 329 | if (fail || tk->addr_valid == 0) { |
| 183 | printk(KERN_ERR | 330 | printk(KERN_ERR |
| @@ -314,33 +461,49 @@ static void collect_procs(struct page *page, struct list_head *tokill) | |||
| 314 | */ | 461 | */ |
| 315 | 462 | ||
| 316 | enum outcome { | 463 | enum outcome { |
| 317 | FAILED, /* Error handling failed */ | 464 | IGNORED, /* Error: cannot be handled */ |
| 465 | FAILED, /* Error: handling failed */ | ||
| 318 | DELAYED, /* Will be handled later */ | 466 | DELAYED, /* Will be handled later */ |
| 319 | IGNORED, /* Error safely ignored */ | ||
| 320 | RECOVERED, /* Successfully recovered */ | 467 | RECOVERED, /* Successfully recovered */ |
| 321 | }; | 468 | }; |
| 322 | 469 | ||
| 323 | static const char *action_name[] = { | 470 | static const char *action_name[] = { |
| 471 | [IGNORED] = "Ignored", | ||
| 324 | [FAILED] = "Failed", | 472 | [FAILED] = "Failed", |
| 325 | [DELAYED] = "Delayed", | 473 | [DELAYED] = "Delayed", |
| 326 | [IGNORED] = "Ignored", | ||
| 327 | [RECOVERED] = "Recovered", | 474 | [RECOVERED] = "Recovered", |
| 328 | }; | 475 | }; |
| 329 | 476 | ||
| 330 | /* | 477 | /* |
| 331 | * Error hit kernel page. | 478 | * XXX: It is possible that a page is isolated from LRU cache, |
| 332 | * Do nothing, try to be lucky and not touch this instead. For a few cases we | 479 | * and then kept in swap cache or failed to remove from page cache. |
| 333 | * could be more sophisticated. | 480 | * The page count will stop it from being freed by unpoison. |
| 481 | * Stress tests should be aware of this memory leak problem. | ||
| 334 | */ | 482 | */ |
| 335 | static int me_kernel(struct page *p, unsigned long pfn) | 483 | static int delete_from_lru_cache(struct page *p) |
| 336 | { | 484 | { |
| 337 | return DELAYED; | 485 | if (!isolate_lru_page(p)) { |
| 486 | /* | ||
| 487 | * Clear sensitive page flags, so that the buddy system won't | ||
| 488 | * complain when the page is unpoison-and-freed. | ||
| 489 | */ | ||
| 490 | ClearPageActive(p); | ||
| 491 | ClearPageUnevictable(p); | ||
| 492 | /* | ||
| 493 | * drop the page count elevated by isolate_lru_page() | ||
| 494 | */ | ||
| 495 | page_cache_release(p); | ||
| 496 | return 0; | ||
| 497 | } | ||
| 498 | return -EIO; | ||
| 338 | } | 499 | } |
| 339 | 500 | ||
| 340 | /* | 501 | /* |
| 341 | * Already poisoned page. | 502 | * Error hit kernel page. |
| 503 | * Do nothing, try to be lucky and not touch this instead. For a few cases we | ||
| 504 | * could be more sophisticated. | ||
| 342 | */ | 505 | */ |
| 343 | static int me_ignore(struct page *p, unsigned long pfn) | 506 | static int me_kernel(struct page *p, unsigned long pfn) |
| 344 | { | 507 | { |
| 345 | return IGNORED; | 508 | return IGNORED; |
| 346 | } | 509 | } |
| @@ -355,14 +518,6 @@ static int me_unknown(struct page *p, unsigned long pfn) | |||
| 355 | } | 518 | } |
| 356 | 519 | ||
| 357 | /* | 520 | /* |
| 358 | * Free memory | ||
| 359 | */ | ||
| 360 | static int me_free(struct page *p, unsigned long pfn) | ||
| 361 | { | ||
| 362 | return DELAYED; | ||
| 363 | } | ||
| 364 | |||
| 365 | /* | ||
| 366 | * Clean (or cleaned) page cache page. | 521 | * Clean (or cleaned) page cache page. |
| 367 | */ | 522 | */ |
| 368 | static int me_pagecache_clean(struct page *p, unsigned long pfn) | 523 | static int me_pagecache_clean(struct page *p, unsigned long pfn) |
| @@ -371,6 +526,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
| 371 | int ret = FAILED; | 526 | int ret = FAILED; |
| 372 | struct address_space *mapping; | 527 | struct address_space *mapping; |
| 373 | 528 | ||
| 529 | delete_from_lru_cache(p); | ||
| 530 | |||
| 374 | /* | 531 | /* |
| 375 | * For anonymous pages we're done the only reference left | 532 | * For anonymous pages we're done the only reference left |
| 376 | * should be the one m_f() holds. | 533 | * should be the one m_f() holds. |
| @@ -500,14 +657,20 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn) | |||
| 500 | /* Trigger EIO in shmem: */ | 657 | /* Trigger EIO in shmem: */ |
| 501 | ClearPageUptodate(p); | 658 | ClearPageUptodate(p); |
| 502 | 659 | ||
| 503 | return DELAYED; | 660 | if (!delete_from_lru_cache(p)) |
| 661 | return DELAYED; | ||
| 662 | else | ||
| 663 | return FAILED; | ||
| 504 | } | 664 | } |
| 505 | 665 | ||
| 506 | static int me_swapcache_clean(struct page *p, unsigned long pfn) | 666 | static int me_swapcache_clean(struct page *p, unsigned long pfn) |
| 507 | { | 667 | { |
| 508 | delete_from_swap_cache(p); | 668 | delete_from_swap_cache(p); |
| 509 | 669 | ||
| 510 | return RECOVERED; | 670 | if (!delete_from_lru_cache(p)) |
| 671 | return RECOVERED; | ||
| 672 | else | ||
| 673 | return FAILED; | ||
| 511 | } | 674 | } |
| 512 | 675 | ||
| 513 | /* | 676 | /* |
| @@ -550,7 +713,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
| 550 | #define tail (1UL << PG_tail) | 713 | #define tail (1UL << PG_tail) |
| 551 | #define compound (1UL << PG_compound) | 714 | #define compound (1UL << PG_compound) |
| 552 | #define slab (1UL << PG_slab) | 715 | #define slab (1UL << PG_slab) |
| 553 | #define buddy (1UL << PG_buddy) | ||
| 554 | #define reserved (1UL << PG_reserved) | 716 | #define reserved (1UL << PG_reserved) |
| 555 | 717 | ||
| 556 | static struct page_state { | 718 | static struct page_state { |
| @@ -559,8 +721,11 @@ static struct page_state { | |||
| 559 | char *msg; | 721 | char *msg; |
| 560 | int (*action)(struct page *p, unsigned long pfn); | 722 | int (*action)(struct page *p, unsigned long pfn); |
| 561 | } error_states[] = { | 723 | } error_states[] = { |
| 562 | { reserved, reserved, "reserved kernel", me_ignore }, | 724 | { reserved, reserved, "reserved kernel", me_kernel }, |
| 563 | { buddy, buddy, "free kernel", me_free }, | 725 | /* |
| 726 | * free pages are specially detected outside this table: | ||
| 727 | * PG_buddy pages make up only a small fraction of all free pages. | ||
| 728 | */ | ||
| 564 | 729 | ||
| 565 | /* | 730 | /* |
| 566 | * Could in theory check if slab page is free or if we can drop | 731 | * Could in theory check if slab page is free or if we can drop |
| @@ -587,7 +752,6 @@ static struct page_state { | |||
| 587 | 752 | ||
| 588 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, | 753 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, |
| 589 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 754 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, |
| 590 | { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, | ||
| 591 | 755 | ||
| 592 | /* | 756 | /* |
| 593 | * Catchall entry: must be at end. | 757 | * Catchall entry: must be at end. |
| @@ -595,20 +759,31 @@ static struct page_state { | |||
| 595 | { 0, 0, "unknown page state", me_unknown }, | 759 | { 0, 0, "unknown page state", me_unknown }, |
| 596 | }; | 760 | }; |
| 597 | 761 | ||
| 762 | #undef dirty | ||
| 763 | #undef sc | ||
| 764 | #undef unevict | ||
| 765 | #undef mlock | ||
| 766 | #undef writeback | ||
| 767 | #undef lru | ||
| 768 | #undef swapbacked | ||
| 769 | #undef head | ||
| 770 | #undef tail | ||
| 771 | #undef compound | ||
| 772 | #undef slab | ||
| 773 | #undef reserved | ||
| 774 | |||
| 598 | static void action_result(unsigned long pfn, char *msg, int result) | 775 | static void action_result(unsigned long pfn, char *msg, int result) |
| 599 | { | 776 | { |
| 600 | struct page *page = NULL; | 777 | struct page *page = pfn_to_page(pfn); |
| 601 | if (pfn_valid(pfn)) | ||
| 602 | page = pfn_to_page(pfn); | ||
| 603 | 778 | ||
| 604 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", | 779 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", |
| 605 | pfn, | 780 | pfn, |
| 606 | page && PageDirty(page) ? "dirty " : "", | 781 | PageDirty(page) ? "dirty " : "", |
| 607 | msg, action_name[result]); | 782 | msg, action_name[result]); |
| 608 | } | 783 | } |
| 609 | 784 | ||
| 610 | static int page_action(struct page_state *ps, struct page *p, | 785 | static int page_action(struct page_state *ps, struct page *p, |
| 611 | unsigned long pfn, int ref) | 786 | unsigned long pfn) |
| 612 | { | 787 | { |
| 613 | int result; | 788 | int result; |
| 614 | int count; | 789 | int count; |
| @@ -616,18 +791,22 @@ static int page_action(struct page_state *ps, struct page *p, | |||
| 616 | result = ps->action(p, pfn); | 791 | result = ps->action(p, pfn); |
| 617 | action_result(pfn, ps->msg, result); | 792 | action_result(pfn, ps->msg, result); |
| 618 | 793 | ||
| 619 | count = page_count(p) - 1 - ref; | 794 | count = page_count(p) - 1; |
| 620 | if (count != 0) | 795 | if (ps->action == me_swapcache_dirty && result == DELAYED) |
| 796 | count--; | ||
| 797 | if (count != 0) { | ||
| 621 | printk(KERN_ERR | 798 | printk(KERN_ERR |
| 622 | "MCE %#lx: %s page still referenced by %d users\n", | 799 | "MCE %#lx: %s page still referenced by %d users\n", |
| 623 | pfn, ps->msg, count); | 800 | pfn, ps->msg, count); |
| 801 | result = FAILED; | ||
| 802 | } | ||
| 624 | 803 | ||
| 625 | /* Could do more checks here if page looks ok */ | 804 | /* Could do more checks here if page looks ok */ |
| 626 | /* | 805 | /* |
| 627 | * Could adjust zone counters here to correct for the missing page. | 806 | * Could adjust zone counters here to correct for the missing page. |
| 628 | */ | 807 | */ |
| 629 | 808 | ||
| 630 | return result == RECOVERED ? 0 : -EBUSY; | 809 | return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; |
| 631 | } | 810 | } |
| 632 | 811 | ||
| 633 | #define N_UNMAP_TRIES 5 | 812 | #define N_UNMAP_TRIES 5 |
| @@ -636,7 +815,7 @@ static int page_action(struct page_state *ps, struct page *p, | |||
| 636 | * Do all that is necessary to remove user space mappings. Unmap | 815 | * Do all that is necessary to remove user space mappings. Unmap |
| 637 | * the pages and send SIGBUS to the processes if the data was dirty. | 816 | * the pages and send SIGBUS to the processes if the data was dirty. |
| 638 | */ | 817 | */ |
| 639 | static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | 818 | static int hwpoison_user_mappings(struct page *p, unsigned long pfn, |
| 640 | int trapno) | 819 | int trapno) |
| 641 | { | 820 | { |
| 642 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | 821 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; |
| @@ -646,15 +825,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
| 646 | int i; | 825 | int i; |
| 647 | int kill = 1; | 826 | int kill = 1; |
| 648 | 827 | ||
| 649 | if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) | 828 | if (PageReserved(p) || PageSlab(p)) |
| 650 | return; | 829 | return SWAP_SUCCESS; |
| 651 | 830 | ||
| 652 | /* | 831 | /* |
| 653 | * This check implies we don't kill processes if their pages | 832 | * This check implies we don't kill processes if their pages |
| 654 | * are in the swap cache early. Those are always late kills. | 833 | * are in the swap cache early. Those are always late kills. |
| 655 | */ | 834 | */ |
| 656 | if (!page_mapped(p)) | 835 | if (!page_mapped(p)) |
| 657 | return; | 836 | return SWAP_SUCCESS; |
| 837 | |||
| 838 | if (PageCompound(p) || PageKsm(p)) | ||
| 839 | return SWAP_FAIL; | ||
| 658 | 840 | ||
| 659 | if (PageSwapCache(p)) { | 841 | if (PageSwapCache(p)) { |
| 660 | printk(KERN_ERR | 842 | printk(KERN_ERR |
| @@ -665,6 +847,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
| 665 | /* | 847 | /* |
| 666 | * Propagate the dirty bit from PTEs to struct page first, because we | 848 | * Propagate the dirty bit from PTEs to struct page first, because we |
| 667 | * need this to decide if we should kill or just drop the page. | 849 | * need this to decide if we should kill or just drop the page. |
| 850 | * XXX: the dirty test could be racy: set_page_dirty() may not always | ||
| 851 | * be called inside page lock (it's recommended but not enforced). | ||
| 668 | */ | 852 | */ |
| 669 | mapping = page_mapping(p); | 853 | mapping = page_mapping(p); |
| 670 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { | 854 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { |
| @@ -716,11 +900,12 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
| 716 | */ | 900 | */ |
| 717 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, | 901 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, |
| 718 | ret != SWAP_SUCCESS, pfn); | 902 | ret != SWAP_SUCCESS, pfn); |
| 903 | |||
| 904 | return ret; | ||
| 719 | } | 905 | } |
| 720 | 906 | ||
| 721 | int __memory_failure(unsigned long pfn, int trapno, int ref) | 907 | int __memory_failure(unsigned long pfn, int trapno, int flags) |
| 722 | { | 908 | { |
| 723 | unsigned long lru_flag; | ||
| 724 | struct page_state *ps; | 909 | struct page_state *ps; |
| 725 | struct page *p; | 910 | struct page *p; |
| 726 | int res; | 911 | int res; |
| @@ -729,13 +914,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
| 729 | panic("Memory failure from trap %d on page %lx", trapno, pfn); | 914 | panic("Memory failure from trap %d on page %lx", trapno, pfn); |
| 730 | 915 | ||
| 731 | if (!pfn_valid(pfn)) { | 916 | if (!pfn_valid(pfn)) { |
| 732 | action_result(pfn, "memory outside kernel control", IGNORED); | 917 | printk(KERN_ERR |
| 733 | return -EIO; | 918 | "MCE %#lx: memory outside kernel control\n", |
| 919 | pfn); | ||
| 920 | return -ENXIO; | ||
| 734 | } | 921 | } |
| 735 | 922 | ||
| 736 | p = pfn_to_page(pfn); | 923 | p = pfn_to_page(pfn); |
| 737 | if (TestSetPageHWPoison(p)) { | 924 | if (TestSetPageHWPoison(p)) { |
| 738 | action_result(pfn, "already hardware poisoned", IGNORED); | 925 | printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); |
| 739 | return 0; | 926 | return 0; |
| 740 | } | 927 | } |
| 741 | 928 | ||
| @@ -752,9 +939,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
| 752 | * In fact it's dangerous to directly bump up page count from 0, | 939 | * In fact it's dangerous to directly bump up page count from 0, |
| 753 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. | 940 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. |
| 754 | */ | 941 | */ |
| 755 | if (!get_page_unless_zero(compound_head(p))) { | 942 | if (!(flags & MF_COUNT_INCREASED) && |
| 756 | action_result(pfn, "free or high order kernel", IGNORED); | 943 | !get_page_unless_zero(compound_head(p))) { |
| 757 | return PageBuddy(compound_head(p)) ? 0 : -EBUSY; | 944 | if (is_free_buddy_page(p)) { |
| 945 | action_result(pfn, "free buddy", DELAYED); | ||
| 946 | return 0; | ||
| 947 | } else { | ||
| 948 | action_result(pfn, "high order kernel", IGNORED); | ||
| 949 | return -EBUSY; | ||
| 950 | } | ||
| 758 | } | 951 | } |
| 759 | 952 | ||
| 760 | /* | 953 | /* |
| @@ -766,14 +959,19 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
| 766 | * walked by the page reclaim code, however that's not a big loss. | 959 | * walked by the page reclaim code, however that's not a big loss. |
| 767 | */ | 960 | */ |
| 768 | if (!PageLRU(p)) | 961 | if (!PageLRU(p)) |
| 769 | lru_add_drain_all(); | 962 | shake_page(p, 0); |
| 770 | lru_flag = p->flags & lru; | 963 | if (!PageLRU(p)) { |
| 771 | if (isolate_lru_page(p)) { | 964 | /* |
| 965 | * shake_page could have turned it free. | ||
| 966 | */ | ||
| 967 | if (is_free_buddy_page(p)) { | ||
| 968 | action_result(pfn, "free buddy, 2nd try", DELAYED); | ||
| 969 | return 0; | ||
| 970 | } | ||
| 772 | action_result(pfn, "non LRU", IGNORED); | 971 | action_result(pfn, "non LRU", IGNORED); |
| 773 | put_page(p); | 972 | put_page(p); |
| 774 | return -EBUSY; | 973 | return -EBUSY; |
| 775 | } | 974 | } |
| 776 | page_cache_release(p); | ||
| 777 | 975 | ||
| 778 | /* | 976 | /* |
| 779 | * Lock the page and wait for writeback to finish. | 977 | * Lock the page and wait for writeback to finish. |
| @@ -781,26 +979,48 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
| 781 | * and in many cases impossible, so we just avoid it here. | 979 | * and in many cases impossible, so we just avoid it here. |
| 782 | */ | 980 | */ |
| 783 | lock_page_nosync(p); | 981 | lock_page_nosync(p); |
| 982 | |||
| 983 | /* | ||
| 984 | * unpoison always clears PG_hwpoison inside page lock | ||
| 985 | */ | ||
| 986 | if (!PageHWPoison(p)) { | ||
| 987 | printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); | ||
| 988 | res = 0; | ||
| 989 | goto out; | ||
| 990 | } | ||
| 991 | if (hwpoison_filter(p)) { | ||
| 992 | if (TestClearPageHWPoison(p)) | ||
| 993 | atomic_long_dec(&mce_bad_pages); | ||
| 994 | unlock_page(p); | ||
| 995 | put_page(p); | ||
| 996 | return 0; | ||
| 997 | } | ||
| 998 | |||
| 784 | wait_on_page_writeback(p); | 999 | wait_on_page_writeback(p); |
| 785 | 1000 | ||
| 786 | /* | 1001 | /* |
| 787 | * Now take care of user space mappings. | 1002 | * Now take care of user space mappings. |
| 1003 | * Abort on fail: __remove_from_page_cache() assumes unmapped page. | ||
| 788 | */ | 1004 | */ |
| 789 | hwpoison_user_mappings(p, pfn, trapno); | 1005 | if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { |
| 1006 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); | ||
| 1007 | res = -EBUSY; | ||
| 1008 | goto out; | ||
| 1009 | } | ||
| 790 | 1010 | ||
| 791 | /* | 1011 | /* |
| 792 | * Torn down by someone else? | 1012 | * Torn down by someone else? |
| 793 | */ | 1013 | */ |
| 794 | if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { | 1014 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { |
| 795 | action_result(pfn, "already truncated LRU", IGNORED); | 1015 | action_result(pfn, "already truncated LRU", IGNORED); |
| 796 | res = 0; | 1016 | res = -EBUSY; |
| 797 | goto out; | 1017 | goto out; |
| 798 | } | 1018 | } |
| 799 | 1019 | ||
| 800 | res = -EBUSY; | 1020 | res = -EBUSY; |
| 801 | for (ps = error_states;; ps++) { | 1021 | for (ps = error_states;; ps++) { |
| 802 | if (((p->flags | lru_flag)& ps->mask) == ps->res) { | 1022 | if ((p->flags & ps->mask) == ps->res) { |
| 803 | res = page_action(ps, p, pfn, ref); | 1023 | res = page_action(ps, p, pfn); |
| 804 | break; | 1024 | break; |
| 805 | } | 1025 | } |
| 806 | } | 1026 | } |
| @@ -831,3 +1051,235 @@ void memory_failure(unsigned long pfn, int trapno) | |||
| 831 | { | 1051 | { |
| 832 | __memory_failure(pfn, trapno, 0); | 1052 | __memory_failure(pfn, trapno, 0); |
| 833 | } | 1053 | } |
| 1054 | |||
| 1055 | /** | ||
| 1056 | * unpoison_memory - Unpoison a previously poisoned page | ||
| 1057 | * @pfn: Page number of the to be unpoisoned page | ||
| 1058 | * | ||
| 1059 | * Software-unpoison a page that has been poisoned by | ||
| 1060 | * memory_failure() earlier. | ||
| 1061 | * | ||
| 1062 | * This is only done on the software-level, so it only works | ||
| 1064 | * for Linux-injected failures, not real hardware failures. | ||
| 1064 | * | ||
| 1065 | * Returns 0 for success, otherwise -errno. | ||
| 1066 | */ | ||
| 1067 | int unpoison_memory(unsigned long pfn) | ||
| 1068 | { | ||
| 1069 | struct page *page; | ||
| 1070 | struct page *p; | ||
| 1071 | int freeit = 0; | ||
| 1072 | |||
| 1073 | if (!pfn_valid(pfn)) | ||
| 1074 | return -ENXIO; | ||
| 1075 | |||
| 1076 | p = pfn_to_page(pfn); | ||
| 1077 | page = compound_head(p); | ||
| 1078 | |||
| 1079 | if (!PageHWPoison(p)) { | ||
| 1080 | pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); | ||
| 1081 | return 0; | ||
| 1082 | } | ||
| 1083 | |||
| 1084 | if (!get_page_unless_zero(page)) { | ||
| 1085 | if (TestClearPageHWPoison(p)) | ||
| 1086 | atomic_long_dec(&mce_bad_pages); | ||
| 1087 | pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); | ||
| 1088 | return 0; | ||
| 1089 | } | ||
| 1090 | |||
| 1091 | lock_page_nosync(page); | ||
| 1092 | /* | ||
| 1093 | * This test is racy because PG_hwpoison is set outside of page lock. | ||
| 1094 | * That's acceptable because that won't trigger kernel panic. Instead, | ||
| 1095 | * the PG_hwpoison page will be caught and isolated on the entrance to | ||
| 1096 | * the free buddy page pool. | ||
| 1097 | */ | ||
| 1098 | if (TestClearPageHWPoison(p)) { | ||
| 1099 | pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); | ||
| 1100 | atomic_long_dec(&mce_bad_pages); | ||
| 1101 | freeit = 1; | ||
| 1102 | } | ||
| 1103 | unlock_page(page); | ||
| 1104 | |||
| 1105 | put_page(page); | ||
| 1106 | if (freeit) | ||
| 1107 | put_page(page); | ||
| 1108 | |||
| 1109 | return 0; | ||
| 1110 | } | ||
| 1111 | EXPORT_SYMBOL(unpoison_memory); | ||
| 1112 | |||
| 1113 | static struct page *new_page(struct page *p, unsigned long private, int **x) | ||
| 1114 | { | ||
| 1115 | int nid = page_to_nid(p); | ||
| 1116 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | ||
| 1117 | } | ||
| 1118 | |||
| 1119 | /* | ||
| 1120 | * Safely get reference count of an arbitrary page. | ||
| 1121 | * Returns 0 for a free page, -EIO for a zero refcount page | ||
| 1122 | * that is not free, and 1 for any other page type. | ||
| 1123 | * For 1 the page is returned with increased page count, otherwise not. | ||
| 1124 | */ | ||
| 1125 | static int get_any_page(struct page *p, unsigned long pfn, int flags) | ||
| 1126 | { | ||
| 1127 | int ret; | ||
| 1128 | |||
| 1129 | if (flags & MF_COUNT_INCREASED) | ||
| 1130 | return 1; | ||
| 1131 | |||
| 1132 | /* | ||
| 1133 | * The lock_system_sleep prevents a race with memory hotplug, | ||
| 1134 | * because the isolation assumes there's only a single user. | ||
| 1135 | * This is a big hammer, a better solution would be nicer. | ||
| 1136 | */ | ||
| 1137 | lock_system_sleep(); | ||
| 1138 | |||
| 1139 | /* | ||
| 1140 | * Isolate the page, so that it doesn't get reallocated if it | ||
| 1141 | * was free. | ||
| 1142 | */ | ||
| 1143 | set_migratetype_isolate(p); | ||
| 1144 | if (!get_page_unless_zero(compound_head(p))) { | ||
| 1145 | if (is_free_buddy_page(p)) { | ||
| 1146 | pr_debug("get_any_page: %#lx free buddy page\n", pfn); | ||
| 1147 | /* Set hwpoison bit while page is still isolated */ | ||
| 1148 | SetPageHWPoison(p); | ||
| 1149 | ret = 0; | ||
| 1150 | } else { | ||
| 1151 | pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", | ||
| 1152 | pfn, p->flags); | ||
| 1153 | ret = -EIO; | ||
| 1154 | } | ||
| 1155 | } else { | ||
| 1156 | /* Not a free page */ | ||
| 1157 | ret = 1; | ||
| 1158 | } | ||
| 1159 | unset_migratetype_isolate(p); | ||
| 1160 | unlock_system_sleep(); | ||
| 1161 | return ret; | ||
| 1162 | } | ||
| 1163 | |||
| 1164 | /** | ||
| 1165 | * soft_offline_page - Soft offline a page. | ||
| 1166 | * @page: page to offline | ||
| 1167 | * @flags: flags. Same as memory_failure(). | ||
| 1168 | * | ||
| 1169 | * Returns 0 on success, otherwise negated errno. | ||
| 1170 | * | ||
| 1171 | * Soft offline a page, by migration or invalidation, | ||
| 1172 | * without killing anything. This is for the case when | ||
| 1173 | * a page is not corrupted yet (so it's still valid to access), | ||
| 1174 | * but has had a number of corrected errors and is better taken | ||
| 1175 | * out. | ||
| 1176 | * | ||
| 1177 | * The actual policy on when to do that is maintained by | ||
| 1178 | * user space. | ||
| 1179 | * | ||
| 1180 | * This should never impact any application or cause data loss, | ||
| 1181 | * however it might take some time. | ||
| 1182 | * | ||
| 1183 | * This is not a 100% solution for all memory, but tries to be | ||
| 1184 | * ``good enough'' for the majority of memory. | ||
| 1185 | */ | ||
| 1186 | int soft_offline_page(struct page *page, int flags) | ||
| 1187 | { | ||
| 1188 | int ret; | ||
| 1189 | unsigned long pfn = page_to_pfn(page); | ||
| 1190 | |||
| 1191 | ret = get_any_page(page, pfn, flags); | ||
| 1192 | if (ret < 0) | ||
| 1193 | return ret; | ||
| 1194 | if (ret == 0) | ||
| 1195 | goto done; | ||
| 1196 | |||
| 1197 | /* | ||
| 1198 | * Page cache page we can handle? | ||
| 1199 | */ | ||
| 1200 | if (!PageLRU(page)) { | ||
| 1201 | /* | ||
| 1202 | * Try to free it. | ||
| 1203 | */ | ||
| 1204 | put_page(page); | ||
| 1205 | shake_page(page, 1); | ||
| 1206 | |||
| 1207 | /* | ||
| 1208 | * Did it turn free? | ||
| 1209 | */ | ||
| 1210 | ret = get_any_page(page, pfn, 0); | ||
| 1211 | if (ret < 0) | ||
| 1212 | return ret; | ||
| 1213 | if (ret == 0) | ||
| 1214 | goto done; | ||
| 1215 | } | ||
| 1216 | if (!PageLRU(page)) { | ||
| 1217 | pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", | ||
| 1218 | pfn, page->flags); | ||
| 1219 | return -EIO; | ||
| 1220 | } | ||
| 1221 | |||
| 1222 | lock_page(page); | ||
| 1223 | wait_on_page_writeback(page); | ||
| 1224 | |||
| 1225 | /* | ||
| 1226 | * Synchronized using the page lock with memory_failure() | ||
| 1227 | */ | ||
| 1228 | if (PageHWPoison(page)) { | ||
| 1229 | unlock_page(page); | ||
| 1230 | put_page(page); | ||
| 1231 | pr_debug("soft offline: %#lx page already poisoned\n", pfn); | ||
| 1232 | return -EBUSY; | ||
| 1233 | } | ||
| 1234 | |||
| 1235 | /* | ||
| 1236 | * Try to invalidate first. This should work for | ||
| 1237 | * non dirty unmapped page cache pages. | ||
| 1238 | */ | ||
| 1239 | ret = invalidate_inode_page(page); | ||
| 1240 | unlock_page(page); | ||
| 1241 | |||
| 1242 | /* | ||
| 1243 | * Drop count because page migration doesn't like raised | ||
| 1244 | * counts. The page could get re-allocated, but if it becomes | ||
| 1245 | * LRU the isolation will just fail. | ||
| 1246 | * RED-PEN: it would be better to keep it isolated here, but we | ||
| 1247 | * would need to fix isolation locking first. | ||
| 1248 | */ | ||
| 1249 | put_page(page); | ||
| 1250 | if (ret == 1) { | ||
| 1251 | ret = 0; | ||
| 1252 | pr_debug("soft_offline: %#lx: invalidated\n", pfn); | ||
| 1253 | goto done; | ||
| 1254 | } | ||
| 1255 | |||
| 1256 | /* | ||
| 1257 | * Simple invalidation didn't work. | ||
| 1258 | * Try to migrate to a new page instead. migrate.c | ||
| 1259 | * handles a large number of cases for us. | ||
| 1260 | */ | ||
| 1261 | ret = isolate_lru_page(page); | ||
| 1262 | if (!ret) { | ||
| 1263 | LIST_HEAD(pagelist); | ||
| 1264 | |||
| 1265 | list_add(&page->lru, &pagelist); | ||
| 1266 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | ||
| 1267 | if (ret) { | ||
| 1268 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | ||
| 1269 | pfn, ret, page->flags); | ||
| 1270 | if (ret > 0) | ||
| 1271 | ret = -EIO; | ||
| 1272 | } | ||
| 1273 | } else { | ||
| 1274 | pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", | ||
| 1275 | pfn, ret, page_count(page), page->flags); | ||
| 1276 | } | ||
| 1277 | if (ret) | ||
| 1278 | return ret; | ||
| 1279 | |||
| 1280 | done: | ||
| 1281 | atomic_long_add(1, &mce_bad_pages); | ||
| 1282 | SetPageHWPoison(page); | ||
| 1283 | /* keep elevated page count for bad page */ | ||
| 1284 | return ret; | ||
| 1285 | } | ||
diff --git a/mm/memory.c b/mm/memory.c index aed45eaf8ac9..09e4b1be7b67 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -2555,6 +2555,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2555 | ret = VM_FAULT_MAJOR; | 2555 | ret = VM_FAULT_MAJOR; |
| 2556 | count_vm_event(PGMAJFAULT); | 2556 | count_vm_event(PGMAJFAULT); |
| 2557 | } else if (PageHWPoison(page)) { | 2557 | } else if (PageHWPoison(page)) { |
| 2558 | /* | ||
| 2559 | * hwpoisoned dirty swapcache pages are kept for killing | ||
| 2560 | * owner processes (which may be unknown at hwpoison time) | ||
| 2561 | */ | ||
| 2558 | ret = VM_FAULT_HWPOISON; | 2562 | ret = VM_FAULT_HWPOISON; |
| 2559 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2563 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
| 2560 | goto out_release; | 2564 | goto out_release; |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 850c4a7e2fe5..74af449b1f1d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -5091,3 +5091,24 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
| 5091 | spin_unlock_irqrestore(&zone->lock, flags); | 5091 | spin_unlock_irqrestore(&zone->lock, flags); |
| 5092 | } | 5092 | } |
| 5093 | #endif | 5093 | #endif |
| 5094 | |||
| 5095 | #ifdef CONFIG_MEMORY_FAILURE | ||
| 5096 | bool is_free_buddy_page(struct page *page) | ||
| 5097 | { | ||
| 5098 | struct zone *zone = page_zone(page); | ||
| 5099 | unsigned long pfn = page_to_pfn(page); | ||
| 5100 | unsigned long flags; | ||
| 5101 | int order; | ||
| 5102 | |||
| 5103 | spin_lock_irqsave(&zone->lock, flags); | ||
| 5104 | for (order = 0; order < MAX_ORDER; order++) { | ||
| 5105 | struct page *page_head = page - (pfn & ((1 << order) - 1)); | ||
| 5106 | |||
| 5107 | if (PageBuddy(page_head) && page_order(page_head) >= order) | ||
| 5108 | break; | ||
| 5109 | } | ||
| 5110 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 5111 | |||
| 5112 | return order < MAX_ORDER; | ||
| 5113 | } | ||
| 5114 | #endif | ||
