author     Dan Williams <dan.j.williams@intel.com>   2018-07-14 00:50:21 -0400
committer  Dave Jiang <dave.jiang@intel.com>         2018-07-23 13:38:06 -0400
commit     6100e34b2526e1dc3dbcc47fea2677974d6aaea5 (patch)
tree       ea42b8ea172ea1f8b0e9d1ea3da2198bd8596d08 /mm/memory-failure.c
parent     c2a7d2a115525d3501d38e23d24875a79a07e15e (diff)

mm, memory_failure: Teach memory_failure() about dev_pagemap pages
    mce: Uncorrected hardware memory error in user-access at af34214200
    {1}[Hardware Error]: It has been corrected by h/w and requires no further action
    mce: [Hardware Error]: Machine check events logged
    {1}[Hardware Error]: event severity: corrected
    Memory failure: 0xaf34214: reserved kernel page still referenced by 1 users
    [..]
    Memory failure: 0xaf34214: recovery action for reserved kernel page: Failed
    mce: Memory error not recovered

In contrast to typical memory, dev_pagemap pages may be dax mapped. With
dax there is no possibility to map in another page dynamically, since dax
establishes 1:1 physical-address-to-file-offset associations. Also,
dev_pagemap pages associated with NVDIMM / persistent memory devices can
internally remap/repair addresses with poison. While memory_failure()
assumes that it can discard typical poisoned pages and keep them unmapped
indefinitely, dev_pagemap pages may be returned to service after the
error is cleared.

Teach memory_failure() to detect and handle MEMORY_DEVICE_HOST
dev_pagemap pages that have poison consumed by userspace. Mark the memory
as UC instead of unmapping it completely to allow ongoing access via the
device driver (nd_pmem). Later, nd_pmem will grow support for marking the
page back to WB when the error is cleared.

Cc: Jan Kara <jack@suse.cz>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
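
For context, a minimal userspace sketch (illustrative only, not part of this
patch): after this change a process that consumes poison through a dax mapping
is sent SIGBUS rather than having the page silently discarded, and the siginfo
carries the faulting address plus the log2 size of the mapping that was torn
down. The handler name and messages below are made up for illustration, and
si_addr_lsb visibility depends on the glibc version in use.

    /* Illustrative sketch; not part of the patch below. */
    #define _GNU_SOURCE
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
    {
            /*
             * BUS_MCEERR_AR: poison was consumed at si->si_addr;
             * si->si_addr_lsb holds the log2 of the unmapped extent
             * (e.g. PAGE_SHIFT, or PMD_SHIFT/PUD_SHIFT for device-dax).
             */
            if (si->si_code == BUS_MCEERR_AR)
                    fprintf(stderr, "hw poison at %p, lsb %d\n",
                            si->si_addr, (int)si->si_addr_lsb);
            _exit(1);
    }

    int main(void)
    {
            struct sigaction sa = {
                    .sa_sigaction = sigbus_handler,
                    .sa_flags = SA_SIGINFO,
            };

            sigaction(SIGBUS, &sa, NULL);
            /* ... mmap() a file on a dax filesystem and load from it ... */
            return 0;
    }
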
Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r--   mm/memory-failure.c   125
1 file changed, 123 insertions(+), 2 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 8a81680d00dd..32a644d9c2ee 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -55,6 +55,7 @@
 #include <linux/hugetlb.h>
 #include <linux/memory_hotplug.h>
 #include <linux/mm_inline.h>
+#include <linux/memremap.h>
 #include <linux/kfifo.h>
 #include <linux/ratelimit.h>
 #include "internal.h"
@@ -263,6 +264,40 @@ void shake_page(struct page *p, int access)
 }
 EXPORT_SYMBOL_GPL(shake_page);
 
+static unsigned long dev_pagemap_mapping_shift(struct page *page,
+		struct vm_area_struct *vma)
+{
+	unsigned long address = vma_address(page, vma);
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pgd = pgd_offset(vma->vm_mm, address);
+	if (!pgd_present(*pgd))
+		return 0;
+	p4d = p4d_offset(pgd, address);
+	if (!p4d_present(*p4d))
+		return 0;
+	pud = pud_offset(p4d, address);
+	if (!pud_present(*pud))
+		return 0;
+	if (pud_devmap(*pud))
+		return PUD_SHIFT;
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		return 0;
+	if (pmd_devmap(*pmd))
+		return PMD_SHIFT;
+	pte = pte_offset_map(pmd, address);
+	if (!pte_present(*pte))
+		return 0;
+	if (pte_devmap(*pte))
+		return PAGE_SHIFT;
+	return 0;
+}
+
 /*
  * Failure handling: if we can't find or can't kill a process there's
  * not much we can do. We just print a message and ignore otherwise.
@@ -292,7 +327,10 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
 	}
 	tk->addr = page_address_in_vma(p, vma);
 	tk->addr_valid = 1;
-	tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
+	if (is_zone_device_page(p))
+		tk->size_shift = dev_pagemap_mapping_shift(p, vma);
+	else
+		tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
 
 	/*
 	 * In theory we don't have to kill when the page was
@@ -300,7 +338,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
 	 * likely very rare kill anyways just out of paranoia, but use
 	 * a SIGKILL because the error is not contained anymore.
 	 */
-	if (tk->addr == -EFAULT) {
+	if (tk->addr == -EFAULT || tk->size_shift == 0) {
 		pr_info("Memory failure: Unable to find user space address %lx in %s\n",
 			page_to_pfn(p), tsk->comm);
 		tk->addr_valid = 0;
@@ -514,6 +552,7 @@ static const char * const action_page_types[] = {
 	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
 	[MF_MSG_BUDDY]			= "free buddy page",
 	[MF_MSG_BUDDY_2ND]		= "free buddy page (2nd try)",
+	[MF_MSG_DAX]			= "dax page",
 	[MF_MSG_UNKNOWN]		= "unknown page",
 };
 
@@ -1111,6 +1150,83 @@ out:
 	return res;
 }
 
+static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
+		struct dev_pagemap *pgmap)
+{
+	struct page *page = pfn_to_page(pfn);
+	const bool unmap_success = true;
+	unsigned long size = 0;
+	struct to_kill *tk;
+	LIST_HEAD(tokill);
+	int rc = -EBUSY;
+	loff_t start;
+
+	/*
+	 * Prevent the inode from being freed while we are interrogating
+	 * the address_space, typically this would be handled by
+	 * lock_page(), but dax pages do not use the page lock. This
+	 * also prevents changes to the mapping of this pfn until
+	 * poison signaling is complete.
+	 */
+	if (!dax_lock_mapping_entry(page))
+		goto out;
+
+	if (hwpoison_filter(page)) {
+		rc = 0;
+		goto unlock;
+	}
+
+	switch (pgmap->type) {
+	case MEMORY_DEVICE_PRIVATE:
+	case MEMORY_DEVICE_PUBLIC:
+		/*
+		 * TODO: Handle HMM pages which may need coordination
+		 * with device-side memory.
+		 */
+		goto unlock;
+	default:
+		break;
+	}
+
+	/*
+	 * Use this flag as an indication that the dax page has been
+	 * remapped UC to prevent speculative consumption of poison.
+	 */
+	SetPageHWPoison(page);
+
+	/*
+	 * Unlike System-RAM there is no possibility to swap in a
+	 * different physical page at a given virtual address, so all
+	 * userspace consumption of ZONE_DEVICE memory necessitates
+	 * SIGBUS (i.e. MF_MUST_KILL)
+	 */
+	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+	collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
+
+	list_for_each_entry(tk, &tokill, nd)
+		if (tk->size_shift)
+			size = max(size, 1UL << tk->size_shift);
+	if (size) {
+		/*
+		 * Unmap the largest mapping to avoid breaking up
+		 * device-dax mappings which are constant size. The
+		 * actual size of the mapping being torn down is
+		 * communicated in siginfo, see kill_proc()
+		 */
+		start = (page->index << PAGE_SHIFT) & ~(size - 1);
+		unmap_mapping_range(page->mapping, start, start + size, 0);
+	}
+	kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
+	rc = 0;
+unlock:
+	dax_unlock_mapping_entry(page);
+out:
+	/* drop pgmap ref acquired in caller */
+	put_dev_pagemap(pgmap);
+	action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
+	return rc;
+}
+
 /**
  * memory_failure - Handle memory failure of a page.
  * @pfn: Page Number of the corrupted page
@@ -1133,6 +1249,7 @@ int memory_failure(unsigned long pfn, int flags)
 	struct page *p;
 	struct page *hpage;
 	struct page *orig_head;
+	struct dev_pagemap *pgmap;
 	int res;
 	unsigned long page_flags;
 
@@ -1145,6 +1262,10 @@ int memory_failure(unsigned long pfn, int flags)
 		return -ENXIO;
 	}
 
+	pgmap = get_dev_pagemap(pfn, NULL);
+	if (pgmap)
+		return memory_failure_dev_pagemap(pfn, flags, pgmap);
+
 	p = pfn_to_page(pfn);
 	if (PageHuge(p))
 		return memory_failure_hugetlb(pfn, flags);