diff options
author | Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> | 2013-07-10 05:27:01 -0400 |
---|---|---|
committer | Tony Luck <tony.luck@intel.com> | 2013-07-10 14:35:02 -0400 |
commit | cf870c70a194443f8fc654ddc9d6cfd02c58003b (patch) | |
tree | 73553a1960478b454dbcb99c0db0c8acf381e58b | |
parent | 9ad95879cd1b22ed016c804f8d686ff83a41a9d4 (diff) |
mce: acpi/apei: Soft-offline a page on firmware GHES notification
If the firmware indicates in the GHES error data entry that the error threshold
has been exceeded for a corrected error event, then we try to soft-offline the
page. This could be called in interrupt context, so we queue this up, similar
to how we handle memory failure scenarios.
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Tony Luck <tony.luck@intel.com>
-rw-r--r-- | drivers/acpi/apei/ghes.c | 38 | ||||
-rw-r--r-- | include/linux/mm.h | 1 | ||||
-rw-r--r-- | mm/memory-failure.c | 5 |
3 files changed, 34 insertions, 10 deletions
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index fcd7d91cec34..a8f362acc8ec 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c | |||
@@ -409,6 +409,34 @@ static void ghes_clear_estatus(struct ghes *ghes) | |||
409 | ghes->flags &= ~GHES_TO_CLEAR; | 409 | ghes->flags &= ~GHES_TO_CLEAR; |
410 | } | 410 | } |
411 | 411 | ||
412 | static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int sev) | ||
413 | { | ||
414 | #ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE | ||
415 | unsigned long pfn; | ||
416 | int sec_sev = ghes_severity(gdata->error_severity); | ||
417 | struct cper_sec_mem_err *mem_err; | ||
418 | mem_err = (struct cper_sec_mem_err *)(gdata + 1); | ||
419 | |||
420 | if (sec_sev == GHES_SEV_CORRECTED && | ||
421 | (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED) && | ||
422 | (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS)) { | ||
423 | pfn = mem_err->physical_addr >> PAGE_SHIFT; | ||
424 | if (pfn_valid(pfn)) | ||
425 | memory_failure_queue(pfn, 0, MF_SOFT_OFFLINE); | ||
426 | else if (printk_ratelimit()) | ||
427 | pr_warn(FW_WARN GHES_PFX | ||
428 | "Invalid address in generic error data: %#llx\n", | ||
429 | mem_err->physical_addr); | ||
430 | } | ||
431 | if (sev == GHES_SEV_RECOVERABLE && | ||
432 | sec_sev == GHES_SEV_RECOVERABLE && | ||
433 | mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) { | ||
434 | pfn = mem_err->physical_addr >> PAGE_SHIFT; | ||
435 | memory_failure_queue(pfn, 0, 0); | ||
436 | } | ||
437 | #endif | ||
438 | } | ||
439 | |||
412 | static void ghes_do_proc(struct ghes *ghes, | 440 | static void ghes_do_proc(struct ghes *ghes, |
413 | const struct acpi_hest_generic_status *estatus) | 441 | const struct acpi_hest_generic_status *estatus) |
414 | { | 442 | { |
@@ -428,15 +456,7 @@ static void ghes_do_proc(struct ghes *ghes, | |||
428 | apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED, | 456 | apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED, |
429 | mem_err); | 457 | mem_err); |
430 | #endif | 458 | #endif |
431 | #ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE | 459 | ghes_handle_memory_failure(gdata, sev); |
432 | if (sev == GHES_SEV_RECOVERABLE && | ||
433 | sec_sev == GHES_SEV_RECOVERABLE && | ||
434 | mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) { | ||
435 | unsigned long pfn; | ||
436 | pfn = mem_err->physical_addr >> PAGE_SHIFT; | ||
437 | memory_failure_queue(pfn, 0, 0); | ||
438 | } | ||
439 | #endif | ||
440 | } | 460 | } |
441 | #ifdef CONFIG_ACPI_APEI_PCIEAER | 461 | #ifdef CONFIG_ACPI_APEI_PCIEAER |
442 | else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type, | 462 | else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type, |
diff --git a/include/linux/mm.h b/include/linux/mm.h index e0c8528a41a4..958e9efd02a7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -1784,6 +1784,7 @@ enum mf_flags { | |||
1784 | MF_COUNT_INCREASED = 1 << 0, | 1784 | MF_COUNT_INCREASED = 1 << 0, |
1785 | MF_ACTION_REQUIRED = 1 << 1, | 1785 | MF_ACTION_REQUIRED = 1 << 1, |
1786 | MF_MUST_KILL = 1 << 2, | 1786 | MF_MUST_KILL = 1 << 2, |
1787 | MF_SOFT_OFFLINE = 1 << 3, | ||
1787 | }; | 1788 | }; |
1788 | extern int memory_failure(unsigned long pfn, int trapno, int flags); | 1789 | extern int memory_failure(unsigned long pfn, int trapno, int flags); |
1789 | extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); | 1790 | extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ceb0c7f1932f..0d6717e52ea2 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -1286,7 +1286,10 @@ static void memory_failure_work_func(struct work_struct *work) | |||
1286 | spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); | 1286 | spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); |
1287 | if (!gotten) | 1287 | if (!gotten) |
1288 | break; | 1288 | break; |
1289 | memory_failure(entry.pfn, entry.trapno, entry.flags); | 1289 | if (entry.flags & MF_SOFT_OFFLINE) |
1290 | soft_offline_page(pfn_to_page(entry.pfn), entry.flags); | ||
1291 | else | ||
1292 | memory_failure(entry.pfn, entry.trapno, entry.flags); | ||
1290 | } | 1293 | } |
1291 | } | 1294 | } |
1292 | 1295 | ||