HWPOISON: Add soft page offline support

This is a simpler, gentler variant of memory_failure() for soft page offlining controlled from user space. It doesn't kill anything, just tries to invalidate and if that doesn't work migrate the page away. This is useful for predictive failure analysis, where a page has a high rate of corrected errors, but hasn't gone bad yet. Instead it can be offlined early and avoided. The offlining is controlled from sysfs, including a new generic entry point for hard page offlining for symmetry too. We use the page isolate facility to prevent re-allocation race. Normally this is only used by memory hotplug. To avoid races with memory allocation I am using lock_system_sleep(). This avoids the situation where memory hotplug is about to isolate a page range and then hwpoison undoes that work. This is a big hammer currently, but the simplest solution currently. When the page is not free or LRU we try to free pages from slab and other caches. The slab freeing is currently quite dumb and does not try to focus on the specific slab cache which might own the page. This could be potentially improved later. Thanks to Fengguang Wu and Haicheng Li for some fixes. [Added fix from Andrew Morton to adapt to new migrate_pages prototype] Signed-off-by: Andi Kleen <ak@linux.intel.com>
author: Andi Kleen <andi@firstfloor.org> 2009-12-16 06:20:00 -0500
committer: Andi Kleen <ak@linux.intel.com> 2009-12-16 06:20:00 -0500
commit: facb6011f3993947283fa15d039dacb4ad140230 (patch)
tree: c317e401fa7c867e1652879627163331f43085ef /mm
parent: 2326c467df4ff814dc07cf1bdaa1e6e0a9c9f21c (diff)
2 files changed, 190 insertions, 6 deletions
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index c597f46ac18a..a77fe3f9e211 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -29,7 +29,7 @@ static int hwpoison_inject(void *data, u64 val)
                return 0;
        if (!PageLRU(p))
-                shake_page(p);
+                shake_page(p, 0);
        /*
         * This implies unable to support non-LRU pages.
         */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b5c3b6bd511f..bcce28755832 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -41,6 +41,9 @@
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/backing-dev.h>
+#include <linux/migrate.h>
+#include <linux/page-isolation.h>
+#include <linux/suspend.h>
 #include "internal.h"
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -201,7 +204,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
 * When a unknown page type is encountered drain as many buffers as possible
 * in the hope to turn the page into a LRU or free page, which we can handle.
 */
-void shake_page(struct page *p)
+void shake_page(struct page *p, int access)
 {
        if (!PageSlab(p)) {
                lru_add_drain_all();
@@ -211,11 +214,19 @@ void shake_page(struct page *p)
                if (PageLRU(p) || is_free_buddy_page(p))
                        return;
        }
        /*
-         * Could call shrink_slab here (which would also
+         * Only all shrink_slab here (which would also
-         * shrink other caches). Unfortunately that might
+         * shrink other caches) if access is not potentially fatal.
-         * also access the corrupted page, which could be fatal.
         */
+        if (access) {
+                int nr;
+                do {
+                        nr = shrink_slab(1000, GFP_KERNEL, 1000);
+                        if (page_count(p) == 0)
+                                break;
+                } while (nr > 10);
+        }
 }
 EXPORT_SYMBOL_GPL(shake_page);
@@ -949,7 +960,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
         * walked by the page reclaim code, however that's not a big loss.
         */
        if (!PageLRU(p))
-                shake_page(p);
+                shake_page(p, 0);
        if (!PageLRU(p)) {
                /*
                 * shake_page could have turned it free.
@@ -1099,3 +1110,176 @@ int unpoison_memory(unsigned long pfn)
        return 0;
 }
 EXPORT_SYMBOL(unpoison_memory);
+static struct page *new_page(struct page *p, unsigned long private, int **x)
+{
+        return alloc_pages(GFP_HIGHUSER_MOVABLE, 0);
+}
+/*
+ * Safely get reference count of an arbitrary page.
+ * Returns 0 for a free page, -EIO for a zero refcount page
+ * that is not free, and 1 for any other page type.
+ * For 1 the page is returned with increased page count, otherwise not.
+ */
+static int get_any_page(struct page *p, unsigned long pfn, int flags)
+{
+        int ret;
+        if (flags & MF_COUNT_INCREASED)
+                return 1;
+        /*
+         * The lock_system_sleep prevents a race with memory hotplug,
+         * because the isolation assumes there's only a single user.
+         * This is a big hammer, a better would be nicer.
+         */
+        lock_system_sleep();
+        /*
+         * Isolate the page, so that it doesn't get reallocated if it
+         * was free.
+         */
+        set_migratetype_isolate(p);
+        if (!get_page_unless_zero(compound_head(p))) {
+                if (is_free_buddy_page(p)) {
+                        pr_debug("get_any_page: %#lx free buddy page\n", pfn);
+                        /* Set hwpoison bit while page is still isolated */
+                        SetPageHWPoison(p);
+                        ret = 0;
+                } else {
+                        pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
+                                pfn, p->flags);
+                        ret = -EIO;
+                }
+        } else {
+                /* Not a free page */
+                ret = 1;
+        }
+        unset_migratetype_isolate(p);
+        unlock_system_sleep();
+        return ret;
+}
+/**
+ * soft_offline_page - Soft offline a page.
+ * @page: page to offline
+ * @flags: flags. Same as memory_failure().
+ *
+ * Returns 0 on success, otherwise negated errno.
+ *
+ * Soft offline a page, by migration or invalidation,
+ * without killing anything. This is for the case when
+ * a page is not corrupted yet (so it's still valid to access),
+ * but has had a number of corrected errors and is better taken
+ * out.
+ *
+ * The actual policy on when to do that is maintained by
+ * user space.
+ *
+ * This should never impact any application or cause data loss,
+ * however it might take some time.
+ *
+ * This is not a 100% solution for all memory, but tries to be
+ * ``good enough'' for the majority of memory.
+ */
+int soft_offline_page(struct page *page, int flags)
+{
+        int ret;
+        unsigned long pfn = page_to_pfn(page);
+        ret = get_any_page(page, pfn, flags);
+        if (ret < 0)
+                return ret;
+        if (ret == 0)
+                goto done;
+        /*
+         * Page cache page we can handle?
+         */
+        if (!PageLRU(page)) {
+                /*
+                 * Try to free it.
+                 */
+                put_page(page);
+                shake_page(page, 1);
+                /*
+                 * Did it turn free?
+                 */
+                ret = get_any_page(page, pfn, 0);
+                if (ret < 0)
+                        return ret;
+                if (ret == 0)
+                        goto done;
+        }
+        if (!PageLRU(page)) {
+                pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
+                                pfn, page->flags);
+                return -EIO;
+        }
+        lock_page(page);
+        wait_on_page_writeback(page);
+        /*
+         * Synchronized using the page lock with memory_failure()
+         */
+        if (PageHWPoison(page)) {
+                unlock_page(page);
+                put_page(page);
+                pr_debug("soft offline: %#lx page already poisoned\n", pfn);
+                return -EBUSY;
+        }
+        /*
+         * Try to invalidate first. This should work for
+         * non dirty unmapped page cache pages.
+         */
+        ret = invalidate_inode_page(page);
+        unlock_page(page);
+        /*
+         * Drop count because page migration doesn't like raised
+         * counts. The page could get re-allocated, but if it becomes
+         * LRU the isolation will just fail.
+         * RED-PEN would be better to keep it isolated here, but we
+         * would need to fix isolation locking first.
+         */
+        put_page(page);
+        if (ret == 1) {
+                ret = 0;
+                pr_debug("soft_offline: %#lx: invalidated\n", pfn);
+                goto done;
+        }
+        /*
+         * Simple invalidation didn't work.
+         * Try to migrate to a new page instead. migrate.c
+         * handles a large number of cases for us.
+         */
+        ret = isolate_lru_page(page);
+        if (!ret) {
+                LIST_HEAD(pagelist);
+                list_add(&page->lru, &pagelist);
+                ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+                if (ret) {
+                        pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+                                pfn, ret, page->flags);
+                        if (ret > 0)
+                                ret = -EIO;
+                }
+        } else {
+                pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
+                                pfn, ret, page_count(page), page->flags);
+        }
+        if (ret)
+                return ret;
+done:
+        atomic_long_add(1, &mce_bad_pages);
+        SetPageHWPoison(page);
+        /* keep elevated page count for bad page */
+        return ret;
+}
author	Andi Kleen <andi@firstfloor.org>	2009-12-16 06:20:00 -0500
committer	Andi Kleen <ak@linux.intel.com>	2009-12-16 06:20:00 -0500
commit	facb6011f3993947283fa15d039dacb4ad140230 (patch)
tree	c317e401fa7c867e1652879627163331f43085ef /mm
parent	2326c467df4ff814dc07cf1bdaa1e6e0a9c9f21c (diff)

diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index c597f46ac18a..a77fe3f9e211 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c
@@ -29,7 +29,7 @@ static int hwpoison_inject(void *data, u64 val)
29	return 0;	29	return 0;
30		30
31	if (!PageLRU(p))	31	if (!PageLRU(p))
32	shake_page(p);	32	shake_page(p, 0);
33	/*	33	/*
34	* This implies unable to support non-LRU pages.	34	* This implies unable to support non-LRU pages.
35	*/	35	*/


diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b5c3b6bd511f..bcce28755832 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c
@@ -41,6 +41,9 @@
41	#include <linux/pagemap.h>	41	#include <linux/pagemap.h>
42	#include <linux/swap.h>	42	#include <linux/swap.h>
43	#include <linux/backing-dev.h>	43	#include <linux/backing-dev.h>
		44	#include <linux/migrate.h>
		45	#include <linux/page-isolation.h>
		46	#include <linux/suspend.h>
44	#include "internal.h"	47	#include "internal.h"
45		48
46	int sysctl_memory_failure_early_kill __read_mostly = 0;	49	int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -201,7 +204,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
201	* When a unknown page type is encountered drain as many buffers as possible	204	* When a unknown page type is encountered drain as many buffers as possible
202	* in the hope to turn the page into a LRU or free page, which we can handle.	205	* in the hope to turn the page into a LRU or free page, which we can handle.
203	*/	206	*/
204	void shake_page(struct page *p)	207	void shake_page(struct page *p, int access)
205	{	208	{
206	if (!PageSlab(p)) {	209	if (!PageSlab(p)) {
207	lru_add_drain_all();	210	lru_add_drain_all();
@@ -211,11 +214,19 @@ void shake_page(struct page *p)
211	if (PageLRU(p) \|\| is_free_buddy_page(p))	214	if (PageLRU(p) \|\| is_free_buddy_page(p))
212	return;	215	return;
213	}	216	}
		217
214	/*	218	/*
215	* Could call shrink_slab here (which would also	219	* Only all shrink_slab here (which would also
216	* shrink other caches). Unfortunately that might	220	* shrink other caches) if access is not potentially fatal.
217	* also access the corrupted page, which could be fatal.
218	*/	221	*/
		222	if (access) {
		223	int nr;
		224	do {
		225	nr = shrink_slab(1000, GFP_KERNEL, 1000);
		226	if (page_count(p) == 0)
		227	break;
		228	} while (nr > 10);
		229	}
219	}	230	}
220	EXPORT_SYMBOL_GPL(shake_page);	231	EXPORT_SYMBOL_GPL(shake_page);
221		232
@@ -949,7 +960,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
949	* walked by the page reclaim code, however that's not a big loss.	960	* walked by the page reclaim code, however that's not a big loss.
950	*/	961	*/
951	if (!PageLRU(p))	962	if (!PageLRU(p))
952	shake_page(p);	963	shake_page(p, 0);
953	if (!PageLRU(p)) {	964	if (!PageLRU(p)) {
954	/*	965	/*
955	* shake_page could have turned it free.	966	* shake_page could have turned it free.
@@ -1099,3 +1110,176 @@ int unpoison_memory(unsigned long pfn)
1099	return 0;	1110	return 0;
1100	}	1111	}
1101	EXPORT_SYMBOL(unpoison_memory);	1112	EXPORT_SYMBOL(unpoison_memory);
		1113
		1114	static struct page new_page(struct page p, unsigned long private, int **x)
		1115	{
		1116	return alloc_pages(GFP_HIGHUSER_MOVABLE, 0);
		1117	}
		1118
		1119	/*
		1120	* Safely get reference count of an arbitrary page.
		1121	* Returns 0 for a free page, -EIO for a zero refcount page
		1122	* that is not free, and 1 for any other page type.
		1123	* For 1 the page is returned with increased page count, otherwise not.
		1124	*/
		1125	static int get_any_page(struct page *p, unsigned long pfn, int flags)
		1126	{
		1127	int ret;
		1128
		1129	if (flags & MF_COUNT_INCREASED)
		1130	return 1;
		1131
		1132	/*
		1133	* The lock_system_sleep prevents a race with memory hotplug,
		1134	* because the isolation assumes there's only a single user.
		1135	* This is a big hammer, a better would be nicer.
		1136	*/
		1137	lock_system_sleep();
		1138
		1139	/*
		1140	* Isolate the page, so that it doesn't get reallocated if it
		1141	* was free.
		1142	*/
		1143	set_migratetype_isolate(p);
		1144	if (!get_page_unless_zero(compound_head(p))) {
		1145	if (is_free_buddy_page(p)) {
		1146	pr_debug("get_any_page: %#lx free buddy page\n", pfn);
		1147	/* Set hwpoison bit while page is still isolated */
		1148	SetPageHWPoison(p);
		1149	ret = 0;
		1150	} else {
		1151	pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
		1152	pfn, p->flags);
		1153	ret = -EIO;
		1154	}
		1155	} else {
		1156	/* Not a free page */
		1157	ret = 1;
		1158	}
		1159	unset_migratetype_isolate(p);
		1160	unlock_system_sleep();
		1161	return ret;
		1162	}
		1163
		1164	/**
		1165	* soft_offline_page - Soft offline a page.
		1166	* @page: page to offline
		1167	* @flags: flags. Same as memory_failure().
		1168	*
		1169	* Returns 0 on success, otherwise negated errno.
		1170	*
		1171	* Soft offline a page, by migration or invalidation,
		1172	* without killing anything. This is for the case when
		1173	* a page is not corrupted yet (so it's still valid to access),
		1174	* but has had a number of corrected errors and is better taken
		1175	* out.
		1176	*
		1177	* The actual policy on when to do that is maintained by
		1178	* user space.
		1179	*
		1180	* This should never impact any application or cause data loss,
		1181	* however it might take some time.
		1182	*
		1183	* This is not a 100% solution for all memory, but tries to be
		1184	* ``good enough'' for the majority of memory.
		1185	*/
		1186	int soft_offline_page(struct page *page, int flags)
		1187	{
		1188	int ret;
		1189	unsigned long pfn = page_to_pfn(page);
		1190
		1191	ret = get_any_page(page, pfn, flags);
		1192	if (ret < 0)
		1193	return ret;
		1194	if (ret == 0)
		1195	goto done;
		1196
		1197	/*
		1198	* Page cache page we can handle?
		1199	*/
		1200	if (!PageLRU(page)) {
		1201	/*
		1202	* Try to free it.
		1203	*/
		1204	put_page(page);
		1205	shake_page(page, 1);
		1206
		1207	/*
		1208	* Did it turn free?
		1209	*/
		1210	ret = get_any_page(page, pfn, 0);
		1211	if (ret < 0)
		1212	return ret;
		1213	if (ret == 0)
		1214	goto done;
		1215	}
		1216	if (!PageLRU(page)) {
		1217	pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
		1218	pfn, page->flags);
		1219	return -EIO;
		1220	}
		1221
		1222	lock_page(page);
		1223	wait_on_page_writeback(page);
		1224
		1225	/*
		1226	* Synchronized using the page lock with memory_failure()
		1227	*/
		1228	if (PageHWPoison(page)) {
		1229	unlock_page(page);
		1230	put_page(page);
		1231	pr_debug("soft offline: %#lx page already poisoned\n", pfn);
		1232	return -EBUSY;
		1233	}
		1234
		1235	/*
		1236	* Try to invalidate first. This should work for
		1237	* non dirty unmapped page cache pages.
		1238	*/
		1239	ret = invalidate_inode_page(page);
		1240	unlock_page(page);
		1241
		1242	/*
		1243	* Drop count because page migration doesn't like raised
		1244	* counts. The page could get re-allocated, but if it becomes
		1245	* LRU the isolation will just fail.
		1246	* RED-PEN would be better to keep it isolated here, but we
		1247	* would need to fix isolation locking first.
		1248	*/
		1249	put_page(page);
		1250	if (ret == 1) {
		1251	ret = 0;
		1252	pr_debug("soft_offline: %#lx: invalidated\n", pfn);
		1253	goto done;
		1254	}
		1255
		1256	/*
		1257	* Simple invalidation didn't work.
		1258	* Try to migrate to a new page instead. migrate.c
		1259	* handles a large number of cases for us.
		1260	*/
		1261	ret = isolate_lru_page(page);
		1262	if (!ret) {
		1263	LIST_HEAD(pagelist);
		1264
		1265	list_add(&page->lru, &pagelist);
		1266	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
		1267	if (ret) {
		1268	pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
		1269	pfn, ret, page->flags);
		1270	if (ret > 0)
		1271	ret = -EIO;
		1272	}
		1273	} else {
		1274	pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
		1275	pfn, ret, page_count(page), page->flags);
		1276	}
		1277	if (ret)
		1278	return ret;
		1279
		1280	done:
		1281	atomic_long_add(1, &mce_bad_pages);
		1282	SetPageHWPoison(page);
		1283	/* keep elevated page count for bad page */
		1284	return ret;
		1285	}