aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndi Kleen <andi@firstfloor.org>2009-12-16 06:20:00 -0500
committerAndi Kleen <ak@linux.intel.com>2009-12-16 06:20:00 -0500
commitfacb6011f3993947283fa15d039dacb4ad140230 (patch)
treec317e401fa7c867e1652879627163331f43085ef
parent2326c467df4ff814dc07cf1bdaa1e6e0a9c9f21c (diff)
HWPOISON: Add soft page offline support
This is a simpler, gentler variant of memory_failure() for soft page offlining controlled from user space. It doesn't kill anything, just tries to invalidate and if that doesn't work migrate the page away. This is useful for predictive failure analysis, where a page has a high rate of corrected errors, but hasn't gone bad yet. Instead it can be offlined early and avoided. The offlining is controlled from sysfs, including a new generic entry point for hard page offlining for symmetry too. We use the page isolate facility to prevent re-allocation race. Normally this is only used by memory hotplug. To avoid races with memory allocation I am using lock_system_sleep(). This avoids the situation where memory hotplug is about to isolate a page range and then hwpoison undoes that work. This is a big hammer currently, but the simplest solution currently. When the page is not free or LRU we try to free pages from slab and other caches. The slab freeing is currently quite dumb and does not try to focus on the specific slab cache which might own the page. This could be potentially improved later. Thanks to Fengguang Wu and Haicheng Li for some fixes. [Added fix from Andrew Morton to adapt to new migrate_pages prototype] Signed-off-by: Andi Kleen <ak@linux.intel.com>
-rw-r--r--Documentation/ABI/testing/sysfs-memory-page-offline44
-rw-r--r--drivers/base/memory.c61
-rw-r--r--include/linux/mm.h3
-rw-r--r--mm/hwpoison-inject.c2
-rw-r--r--mm/memory-failure.c194
5 files changed, 297 insertions, 7 deletions
diff --git a/Documentation/ABI/testing/sysfs-memory-page-offline b/Documentation/ABI/testing/sysfs-memory-page-offline
new file mode 100644
index 000000000000..e14703f12fdf
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-memory-page-offline
@@ -0,0 +1,44 @@
1What: /sys/devices/system/memory/soft_offline_page
2Date: Sep 2009
3KernelVersion: 2.6.33
4Contact: andi@firstfloor.org
5Description:
6 Soft-offline the memory page containing the physical address
7 written into this file. Input is a hex number specifying the
8 physical address of the page. The kernel will then attempt
9 to soft-offline it, by moving the contents elsewhere or
10 dropping it if possible. The page will then be placed
11 on the bad page list and never be reused.
12
13 The offlining is done in kernel specific granularity.
14 Normally it's the base page size of the kernel, but
15 this might change.
16
17 The page must be still accessible, not poisoned. The
18 kernel will never kill anything for this, but rather
19 fail the offline. Return value is the size of the
20 number, or an error when the offlining failed. Reading
21 the file is not allowed.
22
23What: /sys/devices/system/memory/hard_offline_page
24Date: Sep 2009
25KernelVersion: 2.6.33
26Contact: andi@firstfloor.org
27Description:
28 Hard-offline the memory page containing the physical
29 address written into this file. Input is a hex number
30 specifying the physical address of the page. The
31 kernel will then attempt to hard-offline the page, by
32 trying to drop the page or killing any owner or
33 triggering IO errors if needed. Note this may kill
34 any processes owning the page. The kernel will avoid
35 to access this page assuming it's poisoned by the
36 hardware.
37
38 The offlining is done in kernel specific granularity.
39 Normally it's the base page size of the kernel, but
40 this might change.
41
42 Return value is the size of the number, or an error when
43 the offlining failed.
44 Reading the file is not allowed.
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 989429cfed88..c4c8f2e1dd15 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -341,6 +341,64 @@ static inline int memory_probe_init(void)
341} 341}
342#endif 342#endif
343 343
344#ifdef CONFIG_MEMORY_FAILURE
345/*
346 * Support for offlining pages of memory
347 */
348
349/* Soft offline a page */
350static ssize_t
351store_soft_offline_page(struct class *class, const char *buf, size_t count)
352{
353 int ret;
354 u64 pfn;
355 if (!capable(CAP_SYS_ADMIN))
356 return -EPERM;
357 if (strict_strtoull(buf, 0, &pfn) < 0)
358 return -EINVAL;
359 pfn >>= PAGE_SHIFT;
360 if (!pfn_valid(pfn))
361 return -ENXIO;
362 ret = soft_offline_page(pfn_to_page(pfn), 0);
363 return ret == 0 ? count : ret;
364}
365
366/* Forcibly offline a page, including killing processes. */
367static ssize_t
368store_hard_offline_page(struct class *class, const char *buf, size_t count)
369{
370 int ret;
371 u64 pfn;
372 if (!capable(CAP_SYS_ADMIN))
373 return -EPERM;
374 if (strict_strtoull(buf, 0, &pfn) < 0)
375 return -EINVAL;
376 pfn >>= PAGE_SHIFT;
377 ret = __memory_failure(pfn, 0, 0);
378 return ret ? ret : count;
379}
380
381static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page);
382static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page);
383
384static __init int memory_fail_init(void)
385{
386 int err;
387
388 err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
389 &class_attr_soft_offline_page.attr);
390 if (!err)
391 err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
392 &class_attr_hard_offline_page.attr);
393 return err;
394}
395#else
396static inline int memory_fail_init(void)
397{
398 return 0;
399}
400#endif
401
344/* 402/*
345 * Note that phys_device is optional. It is here to allow for 403 * Note that phys_device is optional. It is here to allow for
346 * differentiation between which *physical* devices each 404 * differentiation between which *physical* devices each
@@ -473,6 +531,9 @@ int __init memory_dev_init(void)
473 err = memory_probe_init(); 531 err = memory_probe_init();
474 if (!ret) 532 if (!ret)
475 ret = err; 533 ret = err;
534 err = memory_fail_init();
535 if (!ret)
536 ret = err;
476 err = block_size_init(); 537 err = block_size_init();
477 if (!ret) 538 if (!ret)
478 ret = err; 539 ret = err;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8cdb941fc7b5..849b4a61bd8f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1339,8 +1339,9 @@ extern int __memory_failure(unsigned long pfn, int trapno, int flags);
1339extern int unpoison_memory(unsigned long pfn); 1339extern int unpoison_memory(unsigned long pfn);
1340extern int sysctl_memory_failure_early_kill; 1340extern int sysctl_memory_failure_early_kill;
1341extern int sysctl_memory_failure_recovery; 1341extern int sysctl_memory_failure_recovery;
1342extern void shake_page(struct page *p); 1342extern void shake_page(struct page *p, int access);
1343extern atomic_long_t mce_bad_pages; 1343extern atomic_long_t mce_bad_pages;
1344extern int soft_offline_page(struct page *page, int flags);
1344 1345
1345#endif /* __KERNEL__ */ 1346#endif /* __KERNEL__ */
1346#endif /* _LINUX_MM_H */ 1347#endif /* _LINUX_MM_H */
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index c597f46ac18a..a77fe3f9e211 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -29,7 +29,7 @@ static int hwpoison_inject(void *data, u64 val)
29 return 0; 29 return 0;
30 30
31 if (!PageLRU(p)) 31 if (!PageLRU(p))
32 shake_page(p); 32 shake_page(p, 0);
33 /* 33 /*
34 * This implies unable to support non-LRU pages. 34 * This implies unable to support non-LRU pages.
35 */ 35 */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b5c3b6bd511f..bcce28755832 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -41,6 +41,9 @@
41#include <linux/pagemap.h> 41#include <linux/pagemap.h>
42#include <linux/swap.h> 42#include <linux/swap.h>
43#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
44#include <linux/migrate.h>
45#include <linux/page-isolation.h>
46#include <linux/suspend.h>
44#include "internal.h" 47#include "internal.h"
45 48
46int sysctl_memory_failure_early_kill __read_mostly = 0; 49int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -201,7 +204,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
201 * When a unknown page type is encountered drain as many buffers as possible 204 * When a unknown page type is encountered drain as many buffers as possible
202 * in the hope to turn the page into a LRU or free page, which we can handle. 205 * in the hope to turn the page into a LRU or free page, which we can handle.
203 */ 206 */
204void shake_page(struct page *p) 207void shake_page(struct page *p, int access)
205{ 208{
206 if (!PageSlab(p)) { 209 if (!PageSlab(p)) {
207 lru_add_drain_all(); 210 lru_add_drain_all();
@@ -211,11 +214,19 @@ void shake_page(struct page *p)
211 if (PageLRU(p) || is_free_buddy_page(p)) 214 if (PageLRU(p) || is_free_buddy_page(p))
212 return; 215 return;
213 } 216 }
217
214 /* 218 /*
215 * Could call shrink_slab here (which would also 219 * Only call shrink_slab here (which would also
216 * shrink other caches). Unfortunately that might 220 * shrink other caches) if access is not potentially fatal.
217 * also access the corrupted page, which could be fatal.
218 */ 221 */
222 if (access) {
223 int nr;
224 do {
225 nr = shrink_slab(1000, GFP_KERNEL, 1000);
226 if (page_count(p) == 0)
227 break;
228 } while (nr > 10);
229 }
219} 230}
220EXPORT_SYMBOL_GPL(shake_page); 231EXPORT_SYMBOL_GPL(shake_page);
221 232
@@ -949,7 +960,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
949 * walked by the page reclaim code, however that's not a big loss. 960 * walked by the page reclaim code, however that's not a big loss.
950 */ 961 */
951 if (!PageLRU(p)) 962 if (!PageLRU(p))
952 shake_page(p); 963 shake_page(p, 0);
953 if (!PageLRU(p)) { 964 if (!PageLRU(p)) {
954 /* 965 /*
955 * shake_page could have turned it free. 966 * shake_page could have turned it free.
@@ -1099,3 +1110,176 @@ int unpoison_memory(unsigned long pfn)
1099 return 0; 1110 return 0;
1100} 1111}
1101EXPORT_SYMBOL(unpoison_memory); 1112EXPORT_SYMBOL(unpoison_memory);
1113
1114static struct page *new_page(struct page *p, unsigned long private, int **x)
1115{
1116 return alloc_pages(GFP_HIGHUSER_MOVABLE, 0);
1117}
1118
1119/*
1120 * Safely get reference count of an arbitrary page.
1121 * Returns 0 for a free page, -EIO for a zero refcount page
1122 * that is not free, and 1 for any other page type.
1123 * For 1 the page is returned with increased page count, otherwise not.
1124 */
1125static int get_any_page(struct page *p, unsigned long pfn, int flags)
1126{
1127 int ret;
1128
1129 if (flags & MF_COUNT_INCREASED)
1130 return 1;
1131
1132 /*
1133 * The lock_system_sleep prevents a race with memory hotplug,
1134 * because the isolation assumes there's only a single user.
1135 * This is a big hammer, a better solution would be nicer.
1136 */
1137 lock_system_sleep();
1138
1139 /*
1140 * Isolate the page, so that it doesn't get reallocated if it
1141 * was free.
1142 */
1143 set_migratetype_isolate(p);
1144 if (!get_page_unless_zero(compound_head(p))) {
1145 if (is_free_buddy_page(p)) {
1146 pr_debug("get_any_page: %#lx free buddy page\n", pfn);
1147 /* Set hwpoison bit while page is still isolated */
1148 SetPageHWPoison(p);
1149 ret = 0;
1150 } else {
1151 pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1152 pfn, p->flags);
1153 ret = -EIO;
1154 }
1155 } else {
1156 /* Not a free page */
1157 ret = 1;
1158 }
1159 unset_migratetype_isolate(p);
1160 unlock_system_sleep();
1161 return ret;
1162}
1163
1164/**
1165 * soft_offline_page - Soft offline a page.
1166 * @page: page to offline
1167 * @flags: flags. Same as memory_failure().
1168 *
1169 * Returns 0 on success, otherwise negated errno.
1170 *
1171 * Soft offline a page, by migration or invalidation,
1172 * without killing anything. This is for the case when
1173 * a page is not corrupted yet (so it's still valid to access),
1174 * but has had a number of corrected errors and is better taken
1175 * out.
1176 *
1177 * The actual policy on when to do that is maintained by
1178 * user space.
1179 *
1180 * This should never impact any application or cause data loss,
1181 * however it might take some time.
1182 *
1183 * This is not a 100% solution for all memory, but tries to be
1184 * ``good enough'' for the majority of memory.
1185 */
1186int soft_offline_page(struct page *page, int flags)
1187{
1188 int ret;
1189 unsigned long pfn = page_to_pfn(page);
1190
1191 ret = get_any_page(page, pfn, flags);
1192 if (ret < 0)
1193 return ret;
1194 if (ret == 0)
1195 goto done;
1196
1197 /*
1198 * Page cache page we can handle?
1199 */
1200 if (!PageLRU(page)) {
1201 /*
1202 * Try to free it.
1203 */
1204 put_page(page);
1205 shake_page(page, 1);
1206
1207 /*
1208 * Did it turn free?
1209 */
1210 ret = get_any_page(page, pfn, 0);
1211 if (ret < 0)
1212 return ret;
1213 if (ret == 0)
1214 goto done;
1215 }
1216 if (!PageLRU(page)) {
1217 pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
1218 pfn, page->flags);
1219 return -EIO;
1220 }
1221
1222 lock_page(page);
1223 wait_on_page_writeback(page);
1224
1225 /*
1226 * Synchronized using the page lock with memory_failure()
1227 */
1228 if (PageHWPoison(page)) {
1229 unlock_page(page);
1230 put_page(page);
1231 pr_debug("soft offline: %#lx page already poisoned\n", pfn);
1232 return -EBUSY;
1233 }
1234
1235 /*
1236 * Try to invalidate first. This should work for
1237 * non dirty unmapped page cache pages.
1238 */
1239 ret = invalidate_inode_page(page);
1240 unlock_page(page);
1241
1242 /*
1243 * Drop count because page migration doesn't like raised
1244 * counts. The page could get re-allocated, but if it becomes
1245 * LRU the isolation will just fail.
1246 * RED-PEN would be better to keep it isolated here, but we
1247 * would need to fix isolation locking first.
1248 */
1249 put_page(page);
1250 if (ret == 1) {
1251 ret = 0;
1252 pr_debug("soft_offline: %#lx: invalidated\n", pfn);
1253 goto done;
1254 }
1255
1256 /*
1257 * Simple invalidation didn't work.
1258 * Try to migrate to a new page instead. migrate.c
1259 * handles a large number of cases for us.
1260 */
1261 ret = isolate_lru_page(page);
1262 if (!ret) {
1263 LIST_HEAD(pagelist);
1264
1265 list_add(&page->lru, &pagelist);
1266 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1267 if (ret) {
1268 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1269 pfn, ret, page->flags);
1270 if (ret > 0)
1271 ret = -EIO;
1272 }
1273 } else {
1274 pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1275 pfn, ret, page_count(page), page->flags);
1276 }
1277 if (ret)
1278 return ret;
1279
1280done:
1281 atomic_long_add(1, &mce_bad_pages);
1282 SetPageHWPoison(page);
1283 /* keep elevated page count for bad page */
1284 return ret;
1285}