diff options
Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r-- | mm/memory-failure.c | 175 |
1 files changed, 125 insertions, 50 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 757f6b0accfe..44a8cefeae6e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@ | |||
7 | * Free Software Foundation. | 7 | * Free Software Foundation. |
8 | * | 8 | * |
9 | * High level machine check handler. Handles pages reported by the | 9 | * High level machine check handler. Handles pages reported by the |
10 | * hardware as being corrupted usually due to a 2bit ECC memory or cache | 10 | * hardware as being corrupted usually due to a multi-bit ECC memory or cache |
11 | * failure. | 11 | * failure. |
12 | * | ||
13 | * In addition there is a "soft offline" entry point that allows us to stop | ||
14 | * using not-yet-corrupted-but-suspicious pages without killing anything. | ||
12 | * | 15 | * |
13 | * Handles page cache pages in various states. The tricky part | 16 | * Handles page cache pages in various states. The tricky part |
14 | * here is that we can access any page asynchronous to other VM | 17 | * here is that we can access any page asynchronously in respect to |
15 | * users, because memory failures could happen anytime and anywhere, | 18 | * other VM users, because memory failures could happen anytime and |
16 | * possibly violating some of their assumptions. This is why this code | 19 | * anywhere. This could violate some of their assumptions. This is why |
17 | * has to be extremely careful. Generally it tries to use normal locking | 20 | * this code has to be extremely careful. Generally it tries to use |
18 | * rules, as in get the standard locks, even if that means the | 21 | * normal locking rules, as in get the standard locks, even if that means |
19 | * error handling takes potentially a long time. | 22 | * the error handling takes potentially a long time. |
20 | * | 23 | * |
21 | * The operation to map back from RMAP chains to processes has to walk | 24 | * There are several operations here with exponential complexity because |
22 | * the complete process list and has non linear complexity with the number | 25 | * of unsuitable VM data structures. For example the operation to map back |
23 | * mappings. In short it can be quite slow. But since memory corruptions | 26 | * from RMAP chains to processes has to walk the complete process list and |
24 | * are rare we hope to get away with this. | 27 | * has non linear complexity with the number of mappings. But since memory |
28 | * corruptions are rare we hope to get away with this. This avoids impacting | ||
29 | * the core VM. | ||
25 | */ | 30 | */ |
26 | 31 | ||
27 | /* | 32 | /* |
@@ -30,7 +35,6 @@ | |||
30 | * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages | 35 | * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages |
31 | * - pass bad pages to kdump next kernel | 36 | * - pass bad pages to kdump next kernel |
32 | */ | 37 | */ |
33 | #define DEBUG 1 /* remove me in 2.6.34 */ | ||
34 | #include <linux/kernel.h> | 38 | #include <linux/kernel.h> |
35 | #include <linux/mm.h> | 39 | #include <linux/mm.h> |
36 | #include <linux/page-flags.h> | 40 | #include <linux/page-flags.h> |
@@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p) | |||
78 | return 0; | 82 | return 0; |
79 | 83 | ||
80 | /* | 84 | /* |
81 | * page_mapping() does not accept slab page | 85 | * page_mapping() does not accept slab pages. |
82 | */ | 86 | */ |
83 | if (PageSlab(p)) | 87 | if (PageSlab(p)) |
84 | return -EINVAL; | 88 | return -EINVAL; |
@@ -268,7 +272,7 @@ struct to_kill { | |||
268 | struct list_head nd; | 272 | struct list_head nd; |
269 | struct task_struct *tsk; | 273 | struct task_struct *tsk; |
270 | unsigned long addr; | 274 | unsigned long addr; |
271 | unsigned addr_valid:1; | 275 | char addr_valid; |
272 | }; | 276 | }; |
273 | 277 | ||
274 | /* | 278 | /* |
@@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, | |||
309 | * a SIGKILL because the error is not contained anymore. | 313 | * a SIGKILL because the error is not contained anymore. |
310 | */ | 314 | */ |
311 | if (tk->addr == -EFAULT) { | 315 | if (tk->addr == -EFAULT) { |
312 | pr_debug("MCE: Unable to find user space address %lx in %s\n", | 316 | pr_info("MCE: Unable to find user space address %lx in %s\n", |
313 | page_to_pfn(p), tsk->comm); | 317 | page_to_pfn(p), tsk->comm); |
314 | tk->addr_valid = 0; | 318 | tk->addr_valid = 0; |
315 | } | 319 | } |
@@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
577 | pfn, err); | 581 | pfn, err); |
578 | } else if (page_has_private(p) && | 582 | } else if (page_has_private(p) && |
579 | !try_to_release_page(p, GFP_NOIO)) { | 583 | !try_to_release_page(p, GFP_NOIO)) { |
580 | pr_debug("MCE %#lx: failed to release buffers\n", pfn); | 584 | pr_info("MCE %#lx: failed to release buffers\n", pfn); |
581 | } else { | 585 | } else { |
582 | ret = RECOVERED; | 586 | ret = RECOVERED; |
583 | } | 587 | } |
@@ -693,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) | |||
693 | * Issues: | 697 | * Issues: |
694 | * - Error on hugepage is contained in hugepage unit (not in raw page unit.) | 698 | * - Error on hugepage is contained in hugepage unit (not in raw page unit.) |
695 | * To narrow down kill region to one page, we need to break up pmd. | 699 | * To narrow down kill region to one page, we need to break up pmd. |
696 | * - To support soft-offlining for hugepage, we need to support hugepage | ||
697 | * migration. | ||
698 | */ | 700 | */ |
699 | static int me_huge_page(struct page *p, unsigned long pfn) | 701 | static int me_huge_page(struct page *p, unsigned long pfn) |
700 | { | 702 | { |
703 | int res = 0; | ||
701 | struct page *hpage = compound_head(p); | 704 | struct page *hpage = compound_head(p); |
702 | /* | 705 | /* |
703 | * We can safely recover from error on free or reserved (i.e. | 706 | * We can safely recover from error on free or reserved (i.e. |
@@ -710,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
710 | * so there is no race between isolation and mapping/unmapping. | 713 | * so there is no race between isolation and mapping/unmapping. |
711 | */ | 714 | */ |
712 | if (!(page_mapping(hpage) || PageAnon(hpage))) { | 715 | if (!(page_mapping(hpage) || PageAnon(hpage))) { |
713 | __isolate_hwpoisoned_huge_page(hpage); | 716 | res = dequeue_hwpoisoned_huge_page(hpage); |
714 | return RECOVERED; | 717 | if (!res) |
718 | return RECOVERED; | ||
715 | } | 719 | } |
716 | return DELAYED; | 720 | return DELAYED; |
717 | } | 721 | } |
@@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p, | |||
836 | return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; | 840 | return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; |
837 | } | 841 | } |
838 | 842 | ||
839 | #define N_UNMAP_TRIES 5 | ||
840 | |||
841 | /* | 843 | /* |
842 | * Do all that is necessary to remove user space mappings. Unmap | 844 | * Do all that is necessary to remove user space mappings. Unmap |
843 | * the pages and send SIGBUS to the processes if the data was dirty. | 845 | * the pages and send SIGBUS to the processes if the data was dirty. |
@@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
849 | struct address_space *mapping; | 851 | struct address_space *mapping; |
850 | LIST_HEAD(tokill); | 852 | LIST_HEAD(tokill); |
851 | int ret; | 853 | int ret; |
852 | int i; | ||
853 | int kill = 1; | 854 | int kill = 1; |
854 | struct page *hpage = compound_head(p); | 855 | struct page *hpage = compound_head(p); |
855 | 856 | ||
@@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
903 | if (kill) | 904 | if (kill) |
904 | collect_procs(hpage, &tokill); | 905 | collect_procs(hpage, &tokill); |
905 | 906 | ||
906 | /* | 907 | ret = try_to_unmap(hpage, ttu); |
907 | * try_to_unmap can fail temporarily due to races. | ||
908 | * Try a few times (RED-PEN better strategy?) | ||
909 | */ | ||
910 | for (i = 0; i < N_UNMAP_TRIES; i++) { | ||
911 | ret = try_to_unmap(hpage, ttu); | ||
912 | if (ret == SWAP_SUCCESS) | ||
913 | break; | ||
914 | pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); | ||
915 | } | ||
916 | |||
917 | if (ret != SWAP_SUCCESS) | 908 | if (ret != SWAP_SUCCESS) |
918 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", | 909 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", |
919 | pfn, page_mapcount(hpage)); | 910 | pfn, page_mapcount(hpage)); |
@@ -981,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
981 | * We need/can do nothing about count=0 pages. | 972 | * We need/can do nothing about count=0 pages. |
982 | * 1) it's a free page, and therefore in safe hand: | 973 | * 1) it's a free page, and therefore in safe hand: |
983 | * prep_new_page() will be the gate keeper. | 974 | * prep_new_page() will be the gate keeper. |
984 | * 2) it's part of a non-compound high order page. | 975 | * 2) it's a free hugepage, which is also safe: |
976 | * an affected hugepage will be dequeued from hugepage freelist, | ||
977 | * so there's no concern about reusing it ever after. | ||
978 | * 3) it's part of a non-compound high order page. | ||
985 | * Implies some kernel user: cannot stop them from | 979 | * Implies some kernel user: cannot stop them from |
986 | * R/W the page; let's pray that the page has been | 980 | * R/W the page; let's pray that the page has been |
987 | * used and will be freed some time later. | 981 | * used and will be freed some time later. |
@@ -993,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
993 | if (is_free_buddy_page(p)) { | 987 | if (is_free_buddy_page(p)) { |
994 | action_result(pfn, "free buddy", DELAYED); | 988 | action_result(pfn, "free buddy", DELAYED); |
995 | return 0; | 989 | return 0; |
990 | } else if (PageHuge(hpage)) { | ||
991 | /* | ||
992 | * Check "just unpoisoned", "filter hit", and | ||
993 | * "race with other subpage." | ||
994 | */ | ||
995 | lock_page_nosync(hpage); | ||
996 | if (!PageHWPoison(hpage) | ||
997 | || (hwpoison_filter(p) && TestClearPageHWPoison(p)) | ||
998 | || (p != hpage && TestSetPageHWPoison(hpage))) { | ||
999 | atomic_long_sub(nr_pages, &mce_bad_pages); | ||
1000 | return 0; | ||
1001 | } | ||
1002 | set_page_hwpoison_huge_page(hpage); | ||
1003 | res = dequeue_hwpoisoned_huge_page(hpage); | ||
1004 | action_result(pfn, "free huge", | ||
1005 | res ? IGNORED : DELAYED); | ||
1006 | unlock_page(hpage); | ||
1007 | return res; | ||
996 | } else { | 1008 | } else { |
997 | action_result(pfn, "high order kernel", IGNORED); | 1009 | action_result(pfn, "high order kernel", IGNORED); |
998 | return -EBUSY; | 1010 | return -EBUSY; |
@@ -1147,16 +1159,26 @@ int unpoison_memory(unsigned long pfn) | |||
1147 | page = compound_head(p); | 1159 | page = compound_head(p); |
1148 | 1160 | ||
1149 | if (!PageHWPoison(p)) { | 1161 | if (!PageHWPoison(p)) { |
1150 | pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); | 1162 | pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); |
1151 | return 0; | 1163 | return 0; |
1152 | } | 1164 | } |
1153 | 1165 | ||
1154 | nr_pages = 1 << compound_order(page); | 1166 | nr_pages = 1 << compound_order(page); |
1155 | 1167 | ||
1156 | if (!get_page_unless_zero(page)) { | 1168 | if (!get_page_unless_zero(page)) { |
1169 | /* | ||
1170 | * Since HWPoisoned hugepage should have non-zero refcount, | ||
1171 | * race between memory failure and unpoison seems to happen. | ||
1172 | * In such case unpoison fails and memory failure runs | ||
1173 | * to the end. | ||
1174 | */ | ||
1175 | if (PageHuge(page)) { | ||
1176 | pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); | ||
1177 | return 0; | ||
1178 | } | ||
1157 | if (TestClearPageHWPoison(p)) | 1179 | if (TestClearPageHWPoison(p)) |
1158 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1180 | atomic_long_sub(nr_pages, &mce_bad_pages); |
1159 | pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); | 1181 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); |
1160 | return 0; | 1182 | return 0; |
1161 | } | 1183 | } |
1162 | 1184 | ||
@@ -1168,12 +1190,12 @@ int unpoison_memory(unsigned long pfn) | |||
1168 | * the free buddy page pool. | 1190 | * the free buddy page pool. |
1169 | */ | 1191 | */ |
1170 | if (TestClearPageHWPoison(page)) { | 1192 | if (TestClearPageHWPoison(page)) { |
1171 | pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); | 1193 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); |
1172 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1194 | atomic_long_sub(nr_pages, &mce_bad_pages); |
1173 | freeit = 1; | 1195 | freeit = 1; |
1196 | if (PageHuge(page)) | ||
1197 | clear_page_hwpoison_huge_page(page); | ||
1174 | } | 1198 | } |
1175 | if (PageHuge(p)) | ||
1176 | clear_page_hwpoison_huge_page(page); | ||
1177 | unlock_page(page); | 1199 | unlock_page(page); |
1178 | 1200 | ||
1179 | put_page(page); | 1201 | put_page(page); |
@@ -1187,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory); | |||
1187 | static struct page *new_page(struct page *p, unsigned long private, int **x) | 1209 | static struct page *new_page(struct page *p, unsigned long private, int **x) |
1188 | { | 1210 | { |
1189 | int nid = page_to_nid(p); | 1211 | int nid = page_to_nid(p); |
1190 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | 1212 | if (PageHuge(p)) |
1213 | return alloc_huge_page_node(page_hstate(compound_head(p)), | ||
1214 | nid); | ||
1215 | else | ||
1216 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | ||
1191 | } | 1217 | } |
1192 | 1218 | ||
1193 | /* | 1219 | /* |
@@ -1215,14 +1241,21 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1215 | * was free. | 1241 | * was free. |
1216 | */ | 1242 | */ |
1217 | set_migratetype_isolate(p); | 1243 | set_migratetype_isolate(p); |
1244 | /* | ||
1245 | * When the target page is a free hugepage, just remove it | ||
1246 | * from free hugepage list. | ||
1247 | */ | ||
1218 | if (!get_page_unless_zero(compound_head(p))) { | 1248 | if (!get_page_unless_zero(compound_head(p))) { |
1219 | if (is_free_buddy_page(p)) { | 1249 | if (PageHuge(p)) { |
1220 | pr_debug("get_any_page: %#lx free buddy page\n", pfn); | 1250 | pr_info("get_any_page: %#lx free huge page\n", pfn); |
1251 | ret = dequeue_hwpoisoned_huge_page(compound_head(p)); | ||
1252 | } else if (is_free_buddy_page(p)) { | ||
1253 | pr_info("get_any_page: %#lx free buddy page\n", pfn); | ||
1221 | /* Set hwpoison bit while page is still isolated */ | 1254 | /* Set hwpoison bit while page is still isolated */ |
1222 | SetPageHWPoison(p); | 1255 | SetPageHWPoison(p); |
1223 | ret = 0; | 1256 | ret = 0; |
1224 | } else { | 1257 | } else { |
1225 | pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", | 1258 | pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", |
1226 | pfn, p->flags); | 1259 | pfn, p->flags); |
1227 | ret = -EIO; | 1260 | ret = -EIO; |
1228 | } | 1261 | } |
@@ -1235,6 +1268,45 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1235 | return ret; | 1268 | return ret; |
1236 | } | 1269 | } |
1237 | 1270 | ||
1271 | static int soft_offline_huge_page(struct page *page, int flags) | ||
1272 | { | ||
1273 | int ret; | ||
1274 | unsigned long pfn = page_to_pfn(page); | ||
1275 | struct page *hpage = compound_head(page); | ||
1276 | LIST_HEAD(pagelist); | ||
1277 | |||
1278 | ret = get_any_page(page, pfn, flags); | ||
1279 | if (ret < 0) | ||
1280 | return ret; | ||
1281 | if (ret == 0) | ||
1282 | goto done; | ||
1283 | |||
1284 | if (PageHWPoison(hpage)) { | ||
1285 | put_page(hpage); | ||
1286 | pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); | ||
1287 | return -EBUSY; | ||
1288 | } | ||
1289 | |||
1290 | /* Keep page count to indicate a given hugepage is isolated. */ | ||
1291 | |||
1292 | list_add(&hpage->lru, &pagelist); | ||
1293 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | ||
1294 | if (ret) { | ||
1295 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | ||
1296 | pfn, ret, page->flags); | ||
1297 | if (ret > 0) | ||
1298 | ret = -EIO; | ||
1299 | return ret; | ||
1300 | } | ||
1301 | done: | ||
1302 | if (!PageHWPoison(hpage)) | ||
1303 | atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); | ||
1304 | set_page_hwpoison_huge_page(hpage); | ||
1305 | dequeue_hwpoisoned_huge_page(hpage); | ||
1306 | /* keep elevated page count for bad page */ | ||
1307 | return ret; | ||
1308 | } | ||
1309 | |||
1238 | /** | 1310 | /** |
1239 | * soft_offline_page - Soft offline a page. | 1311 | * soft_offline_page - Soft offline a page. |
1240 | * @page: page to offline | 1312 | * @page: page to offline |
@@ -1262,6 +1334,9 @@ int soft_offline_page(struct page *page, int flags) | |||
1262 | int ret; | 1334 | int ret; |
1263 | unsigned long pfn = page_to_pfn(page); | 1335 | unsigned long pfn = page_to_pfn(page); |
1264 | 1336 | ||
1337 | if (PageHuge(page)) | ||
1338 | return soft_offline_huge_page(page, flags); | ||
1339 | |||
1265 | ret = get_any_page(page, pfn, flags); | 1340 | ret = get_any_page(page, pfn, flags); |
1266 | if (ret < 0) | 1341 | if (ret < 0) |
1267 | return ret; | 1342 | return ret; |
@@ -1288,7 +1363,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1288 | goto done; | 1363 | goto done; |
1289 | } | 1364 | } |
1290 | if (!PageLRU(page)) { | 1365 | if (!PageLRU(page)) { |
1291 | pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", | 1366 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", |
1292 | pfn, page->flags); | 1367 | pfn, page->flags); |
1293 | return -EIO; | 1368 | return -EIO; |
1294 | } | 1369 | } |
@@ -1302,7 +1377,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1302 | if (PageHWPoison(page)) { | 1377 | if (PageHWPoison(page)) { |
1303 | unlock_page(page); | 1378 | unlock_page(page); |
1304 | put_page(page); | 1379 | put_page(page); |
1305 | pr_debug("soft offline: %#lx page already poisoned\n", pfn); | 1380 | pr_info("soft offline: %#lx page already poisoned\n", pfn); |
1306 | return -EBUSY; | 1381 | return -EBUSY; |
1307 | } | 1382 | } |
1308 | 1383 | ||
@@ -1323,7 +1398,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1323 | put_page(page); | 1398 | put_page(page); |
1324 | if (ret == 1) { | 1399 | if (ret == 1) { |
1325 | ret = 0; | 1400 | ret = 0; |
1326 | pr_debug("soft_offline: %#lx: invalidated\n", pfn); | 1401 | pr_info("soft_offline: %#lx: invalidated\n", pfn); |
1327 | goto done; | 1402 | goto done; |
1328 | } | 1403 | } |
1329 | 1404 | ||
@@ -1339,13 +1414,13 @@ int soft_offline_page(struct page *page, int flags) | |||
1339 | list_add(&page->lru, &pagelist); | 1414 | list_add(&page->lru, &pagelist); |
1340 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | 1415 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); |
1341 | if (ret) { | 1416 | if (ret) { |
1342 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | 1417 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1343 | pfn, ret, page->flags); | 1418 | pfn, ret, page->flags); |
1344 | if (ret > 0) | 1419 | if (ret > 0) |
1345 | ret = -EIO; | 1420 | ret = -EIO; |
1346 | } | 1421 | } |
1347 | } else { | 1422 | } else { |
1348 | pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", | 1423 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", |
1349 | pfn, ret, page_count(page), page->flags); | 1424 | pfn, ret, page_count(page), page->flags); |
1350 | } | 1425 | } |
1351 | if (ret) | 1426 | if (ret) |