diff options
| -rw-r--r-- | arch/x86/Kconfig | 2 | ||||
| -rw-r--r-- | arch/x86/kernel/acpi/Makefile | 1 | ||||
| -rw-r--r-- | arch/x86/kernel/acpi/apei.c | 62 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 9 | ||||
| -rw-r--r-- | drivers/Kconfig | 2 | ||||
| -rw-r--r-- | drivers/Makefile | 1 | ||||
| -rw-r--r-- | drivers/acpi/Kconfig | 4 | ||||
| -rw-r--r-- | drivers/acpi/acpi_extlog.c | 46 | ||||
| -rw-r--r-- | drivers/acpi/apei/Kconfig | 8 | ||||
| -rw-r--r-- | drivers/acpi/apei/apei-base.c | 13 | ||||
| -rw-r--r-- | drivers/acpi/apei/ghes.c | 173 | ||||
| -rw-r--r-- | drivers/acpi/apei/hest.c | 29 | ||||
| -rw-r--r-- | drivers/edac/Kconfig | 1 | ||||
| -rw-r--r-- | drivers/edac/edac_mc.c | 3 | ||||
| -rw-r--r-- | drivers/firmware/efi/cper.c | 192 | ||||
| -rw-r--r-- | drivers/pci/pcie/aer/Kconfig | 1 | ||||
| -rw-r--r-- | drivers/pci/pcie/aer/aerdrv_errprint.c | 4 | ||||
| -rw-r--r-- | drivers/ras/Kconfig | 2 | ||||
| -rw-r--r-- | drivers/ras/Makefile | 1 | ||||
| -rw-r--r-- | drivers/ras/debugfs.c | 56 | ||||
| -rw-r--r-- | drivers/ras/ras.c | 29 | ||||
| -rw-r--r-- | include/acpi/apei.h | 4 | ||||
| -rw-r--r-- | include/linux/aer.h | 2 | ||||
| -rw-r--r-- | include/linux/cper.h | 32 | ||||
| -rw-r--r-- | include/linux/nmi.h | 4 | ||||
| -rw-r--r-- | include/linux/ras.h | 14 | ||||
| -rw-r--r-- | include/ras/ras_event.h | 128 | ||||
| -rw-r--r-- | include/trace/events/ras.h | 77 |
28 files changed, 653 insertions, 247 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 86216a55eb59..6b71f0417293 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
| @@ -131,6 +131,8 @@ config X86 | |||
| 131 | select GENERIC_CPU_AUTOPROBE | 131 | select GENERIC_CPU_AUTOPROBE |
| 132 | select HAVE_ARCH_AUDITSYSCALL | 132 | select HAVE_ARCH_AUDITSYSCALL |
| 133 | select ARCH_SUPPORTS_ATOMIC_RMW | 133 | select ARCH_SUPPORTS_ATOMIC_RMW |
| 134 | select HAVE_ACPI_APEI if ACPI | ||
| 135 | select HAVE_ACPI_APEI_NMI if ACPI | ||
| 134 | 136 | ||
| 135 | config INSTRUCTION_DECODER | 137 | config INSTRUCTION_DECODER |
| 136 | def_bool y | 138 | def_bool y |
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 163b22581472..3242e591fa82 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | obj-$(CONFIG_ACPI) += boot.o | 1 | obj-$(CONFIG_ACPI) += boot.o |
| 2 | obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o | 2 | obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o |
| 3 | obj-$(CONFIG_ACPI_APEI) += apei.o | ||
| 3 | 4 | ||
| 4 | ifneq ($(CONFIG_ACPI_PROCESSOR),) | 5 | ifneq ($(CONFIG_ACPI_PROCESSOR),) |
| 5 | obj-y += cstate.o | 6 | obj-y += cstate.o |
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c new file mode 100644 index 000000000000..c280df6b2aa2 --- /dev/null +++ b/arch/x86/kernel/acpi/apei.c | |||
| @@ -0,0 +1,62 @@ | |||
| 1 | /* | ||
| 2 | * Arch-specific APEI-related functions. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | */ | ||
| 14 | |||
| 15 | #include <acpi/apei.h> | ||
| 16 | |||
| 17 | #include <asm/mce.h> | ||
| 18 | #include <asm/tlbflush.h> | ||
| 19 | |||
| 20 | int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data) | ||
| 21 | { | ||
| 22 | #ifdef CONFIG_X86_MCE | ||
| 23 | int i; | ||
| 24 | struct acpi_hest_ia_corrected *cmc; | ||
| 25 | struct acpi_hest_ia_error_bank *mc_bank; | ||
| 26 | |||
| 27 | if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) | ||
| 28 | return 0; | ||
| 29 | |||
| 30 | cmc = (struct acpi_hest_ia_corrected *)hest_hdr; | ||
| 31 | if (!cmc->enabled) | ||
| 32 | return 0; | ||
| 33 | |||
| 34 | /* | ||
| 35 | * We expect HEST to provide a list of MC banks that report errors | ||
| 36 | * in firmware first mode. Otherwise, return non-zero value to | ||
| 37 | * indicate that we are done parsing HEST. | ||
| 38 | */ | ||
| 39 | if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || | ||
| 40 | !cmc->num_hardware_banks) | ||
| 41 | return 1; | ||
| 42 | |||
| 43 | pr_info("HEST: Enabling Firmware First mode for corrected errors.\n"); | ||
| 44 | |||
| 45 | mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1); | ||
| 46 | for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++) | ||
| 47 | mce_disable_bank(mc_bank->bank_number); | ||
| 48 | #endif | ||
| 49 | return 1; | ||
| 50 | } | ||
| 51 | |||
| 52 | void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) | ||
| 53 | { | ||
| 54 | #ifdef CONFIG_X86_MCE | ||
| 55 | apei_mce_report_mem_error(sev, mem_err); | ||
| 56 | #endif | ||
| 57 | } | ||
| 58 | |||
| 59 | void arch_apei_flush_tlb_one(unsigned long addr) | ||
| 60 | { | ||
| 61 | __flush_tlb_one(addr); | ||
| 62 | } | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 9a79c8dbd8e8..4fc57975acc1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
| @@ -2385,6 +2385,10 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 2385 | threshold_cpu_callback(action, cpu); | 2385 | threshold_cpu_callback(action, cpu); |
| 2386 | mce_device_remove(cpu); | 2386 | mce_device_remove(cpu); |
| 2387 | mce_intel_hcpu_update(cpu); | 2387 | mce_intel_hcpu_update(cpu); |
| 2388 | |||
| 2389 | /* intentionally ignoring frozen here */ | ||
| 2390 | if (!(action & CPU_TASKS_FROZEN)) | ||
| 2391 | cmci_rediscover(); | ||
| 2388 | break; | 2392 | break; |
| 2389 | case CPU_DOWN_PREPARE: | 2393 | case CPU_DOWN_PREPARE: |
| 2390 | smp_call_function_single(cpu, mce_disable_cpu, &action, 1); | 2394 | smp_call_function_single(cpu, mce_disable_cpu, &action, 1); |
| @@ -2396,11 +2400,6 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 2396 | break; | 2400 | break; |
| 2397 | } | 2401 | } |
| 2398 | 2402 | ||
| 2399 | if (action == CPU_POST_DEAD) { | ||
| 2400 | /* intentionally ignoring frozen here */ | ||
| 2401 | cmci_rediscover(); | ||
| 2402 | } | ||
| 2403 | |||
| 2404 | return NOTIFY_OK; | 2403 | return NOTIFY_OK; |
| 2405 | } | 2404 | } |
| 2406 | 2405 | ||
diff --git a/drivers/Kconfig b/drivers/Kconfig index 0e87a34b6472..4e6e66c3c8d6 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig | |||
| @@ -176,4 +176,6 @@ source "drivers/powercap/Kconfig" | |||
| 176 | 176 | ||
| 177 | source "drivers/mcb/Kconfig" | 177 | source "drivers/mcb/Kconfig" |
| 178 | 178 | ||
| 179 | source "drivers/ras/Kconfig" | ||
| 180 | |||
| 179 | endmenu | 181 | endmenu |
diff --git a/drivers/Makefile b/drivers/Makefile index f98b50d8251d..65c32b1cea3d 100644 --- a/drivers/Makefile +++ b/drivers/Makefile | |||
| @@ -158,3 +158,4 @@ obj-$(CONFIG_NTB) += ntb/ | |||
| 158 | obj-$(CONFIG_FMC) += fmc/ | 158 | obj-$(CONFIG_FMC) += fmc/ |
| 159 | obj-$(CONFIG_POWERCAP) += powercap/ | 159 | obj-$(CONFIG_POWERCAP) += powercap/ |
| 160 | obj-$(CONFIG_MCB) += mcb/ | 160 | obj-$(CONFIG_MCB) += mcb/ |
| 161 | obj-$(CONFIG_RAS) += ras/ | ||
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index a34a22841002..206942b8d105 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig | |||
| @@ -370,6 +370,7 @@ config ACPI_EXTLOG | |||
| 370 | tristate "Extended Error Log support" | 370 | tristate "Extended Error Log support" |
| 371 | depends on X86_MCE && X86_LOCAL_APIC | 371 | depends on X86_MCE && X86_LOCAL_APIC |
| 372 | select UEFI_CPER | 372 | select UEFI_CPER |
| 373 | select RAS | ||
| 373 | default n | 374 | default n |
| 374 | help | 375 | help |
| 375 | Certain usages such as Predictive Failure Analysis (PFA) require | 376 | Certain usages such as Predictive Failure Analysis (PFA) require |
| @@ -384,6 +385,7 @@ config ACPI_EXTLOG | |||
| 384 | 385 | ||
| 385 | Enhanced MCA Logging allows firmware to provide additional error | 386 | Enhanced MCA Logging allows firmware to provide additional error |
| 386 | information to system software, synchronous with MCE or CMCI. This | 387 | information to system software, synchronous with MCE or CMCI. This |
| 387 | driver adds support for that functionality. | 388 | driver adds support for that functionality with corresponding |
| 389 | tracepoint which carries that information to userspace. | ||
| 388 | 390 | ||
| 389 | endif # ACPI | 391 | endif # ACPI |
diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index 185334114d71..0ad6f389d922 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c | |||
| @@ -12,10 +12,12 @@ | |||
| 12 | #include <linux/cper.h> | 12 | #include <linux/cper.h> |
| 13 | #include <linux/ratelimit.h> | 13 | #include <linux/ratelimit.h> |
| 14 | #include <linux/edac.h> | 14 | #include <linux/edac.h> |
| 15 | #include <linux/ras.h> | ||
| 15 | #include <asm/cpu.h> | 16 | #include <asm/cpu.h> |
| 16 | #include <asm/mce.h> | 17 | #include <asm/mce.h> |
| 17 | 18 | ||
| 18 | #include "apei/apei-internal.h" | 19 | #include "apei/apei-internal.h" |
| 20 | #include <ras/ras_event.h> | ||
| 19 | 21 | ||
| 20 | #define EXT_ELOG_ENTRY_MASK GENMASK_ULL(51, 0) /* elog entry address mask */ | 22 | #define EXT_ELOG_ENTRY_MASK GENMASK_ULL(51, 0) /* elog entry address mask */ |
| 21 | 23 | ||
| @@ -137,8 +139,12 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, | |||
| 137 | struct mce *mce = (struct mce *)data; | 139 | struct mce *mce = (struct mce *)data; |
| 138 | int bank = mce->bank; | 140 | int bank = mce->bank; |
| 139 | int cpu = mce->extcpu; | 141 | int cpu = mce->extcpu; |
| 140 | struct acpi_generic_status *estatus; | 142 | struct acpi_generic_status *estatus, *tmp; |
| 141 | int rc; | 143 | struct acpi_generic_data *gdata; |
| 144 | const uuid_le *fru_id = &NULL_UUID_LE; | ||
| 145 | char *fru_text = ""; | ||
| 146 | uuid_le *sec_type; | ||
| 147 | static u32 err_seq; | ||
| 142 | 148 | ||
| 143 | estatus = extlog_elog_entry_check(cpu, bank); | 149 | estatus = extlog_elog_entry_check(cpu, bank); |
| 144 | if (estatus == NULL) | 150 | if (estatus == NULL) |
| @@ -148,8 +154,29 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, | |||
| 148 | /* clear record status to enable BIOS to update it again */ | 154 | /* clear record status to enable BIOS to update it again */ |
| 149 | estatus->block_status = 0; | 155 | estatus->block_status = 0; |
| 150 | 156 | ||
| 151 | rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu); | 157 | tmp = (struct acpi_generic_status *)elog_buf; |
| 158 | |||
| 159 | if (!ras_userspace_consumers()) { | ||
| 160 | print_extlog_rcd(NULL, tmp, cpu); | ||
| 161 | goto out; | ||
| 162 | } | ||
| 163 | |||
| 164 | /* log event via trace */ | ||
| 165 | err_seq++; | ||
| 166 | gdata = (struct acpi_generic_data *)(tmp + 1); | ||
| 167 | if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID) | ||
| 168 | fru_id = (uuid_le *)gdata->fru_id; | ||
| 169 | if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT) | ||
| 170 | fru_text = gdata->fru_text; | ||
| 171 | sec_type = (uuid_le *)gdata->section_type; | ||
| 172 | if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) { | ||
| 173 | struct cper_sec_mem_err *mem = (void *)(gdata + 1); | ||
| 174 | if (gdata->error_data_length >= sizeof(*mem)) | ||
| 175 | trace_extlog_mem_event(mem, err_seq, fru_id, fru_text, | ||
| 176 | (u8)gdata->error_severity); | ||
| 177 | } | ||
| 152 | 178 | ||
| 179 | out: | ||
| 153 | return NOTIFY_STOP; | 180 | return NOTIFY_STOP; |
| 154 | } | 181 | } |
| 155 | 182 | ||
| @@ -196,19 +223,16 @@ static int __init extlog_init(void) | |||
| 196 | u64 cap; | 223 | u64 cap; |
| 197 | int rc; | 224 | int rc; |
| 198 | 225 | ||
| 226 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
| 227 | |||
| 228 | if (!(cap & MCG_ELOG_P) || !extlog_get_l1addr()) | ||
| 229 | return -ENODEV; | ||
| 230 | |||
| 199 | if (get_edac_report_status() == EDAC_REPORTING_FORCE) { | 231 | if (get_edac_report_status() == EDAC_REPORTING_FORCE) { |
| 200 | pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n"); | 232 | pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n"); |
| 201 | return -EPERM; | 233 | return -EPERM; |
| 202 | } | 234 | } |
| 203 | 235 | ||
| 204 | rc = -ENODEV; | ||
| 205 | rdmsrl(MSR_IA32_MCG_CAP, cap); | ||
| 206 | if (!(cap & MCG_ELOG_P)) | ||
| 207 | return rc; | ||
| 208 | |||
| 209 | if (!extlog_get_l1addr()) | ||
| 210 | return rc; | ||
| 211 | |||
| 212 | rc = -EINVAL; | 236 | rc = -EINVAL; |
| 213 | /* get L1 header to fetch necessary information */ | 237 | /* get L1 header to fetch necessary information */ |
| 214 | l1_hdr_size = sizeof(struct extlog_l1_head); | 238 | l1_hdr_size = sizeof(struct extlog_l1_head); |
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig index c4dac7150960..b0140c8fc733 100644 --- a/drivers/acpi/apei/Kconfig +++ b/drivers/acpi/apei/Kconfig | |||
| @@ -1,9 +1,15 @@ | |||
| 1 | config HAVE_ACPI_APEI | ||
| 2 | bool | ||
| 3 | |||
| 4 | config HAVE_ACPI_APEI_NMI | ||
| 5 | bool | ||
| 6 | |||
| 1 | config ACPI_APEI | 7 | config ACPI_APEI |
| 2 | bool "ACPI Platform Error Interface (APEI)" | 8 | bool "ACPI Platform Error Interface (APEI)" |
| 3 | select MISC_FILESYSTEMS | 9 | select MISC_FILESYSTEMS |
| 4 | select PSTORE | 10 | select PSTORE |
| 5 | select UEFI_CPER | 11 | select UEFI_CPER |
| 6 | depends on X86 | 12 | depends on HAVE_ACPI_APEI |
| 7 | help | 13 | help |
| 8 | APEI allows to report errors (for example from the chipset) | 14 | APEI allows to report errors (for example from the chipset) |
| 9 | to the operating system. This improves NMI handling | 15 | to the operating system. This improves NMI handling |
diff --git a/drivers/acpi/apei/apei-base.c b/drivers/acpi/apei/apei-base.c index 8678dfe5366b..2cd7bdd6c8b3 100644 --- a/drivers/acpi/apei/apei-base.c +++ b/drivers/acpi/apei/apei-base.c | |||
| @@ -745,6 +745,19 @@ struct dentry *apei_get_debugfs_dir(void) | |||
| 745 | } | 745 | } |
| 746 | EXPORT_SYMBOL_GPL(apei_get_debugfs_dir); | 746 | EXPORT_SYMBOL_GPL(apei_get_debugfs_dir); |
| 747 | 747 | ||
| 748 | int __weak arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, | ||
| 749 | void *data) | ||
| 750 | { | ||
| 751 | return 1; | ||
| 752 | } | ||
| 753 | EXPORT_SYMBOL_GPL(arch_apei_enable_cmcff); | ||
| 754 | |||
| 755 | void __weak arch_apei_report_mem_error(int sev, | ||
| 756 | struct cper_sec_mem_err *mem_err) | ||
| 757 | { | ||
| 758 | } | ||
| 759 | EXPORT_SYMBOL_GPL(arch_apei_report_mem_error); | ||
| 760 | |||
| 748 | int apei_osc_setup(void) | 761 | int apei_osc_setup(void) |
| 749 | { | 762 | { |
| 750 | static u8 whea_uuid_str[] = "ed855e0c-6c90-47bf-a62a-26de0fc5ad5c"; | 763 | static u8 whea_uuid_str[] = "ed855e0c-6c90-47bf-a62a-26de0fc5ad5c"; |
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index dab7cb7349df..e05d84e7b06d 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c | |||
| @@ -47,11 +47,11 @@ | |||
| 47 | #include <linux/genalloc.h> | 47 | #include <linux/genalloc.h> |
| 48 | #include <linux/pci.h> | 48 | #include <linux/pci.h> |
| 49 | #include <linux/aer.h> | 49 | #include <linux/aer.h> |
| 50 | #include <linux/nmi.h> | ||
| 50 | 51 | ||
| 51 | #include <acpi/ghes.h> | 52 | #include <acpi/ghes.h> |
| 52 | #include <asm/mce.h> | 53 | #include <acpi/apei.h> |
| 53 | #include <asm/tlbflush.h> | 54 | #include <asm/tlbflush.h> |
| 54 | #include <asm/nmi.h> | ||
| 55 | 55 | ||
| 56 | #include "apei-internal.h" | 56 | #include "apei-internal.h" |
| 57 | 57 | ||
| @@ -86,8 +86,6 @@ | |||
| 86 | bool ghes_disable; | 86 | bool ghes_disable; |
| 87 | module_param_named(disable, ghes_disable, bool, 0); | 87 | module_param_named(disable, ghes_disable, bool, 0); |
| 88 | 88 | ||
| 89 | static int ghes_panic_timeout __read_mostly = 30; | ||
| 90 | |||
| 91 | /* | 89 | /* |
| 92 | * All error sources notified with SCI shares one notifier function, | 90 | * All error sources notified with SCI shares one notifier function, |
| 93 | * so they need to be linked and checked one by one. This is applied | 91 | * so they need to be linked and checked one by one. This is applied |
| @@ -97,16 +95,9 @@ static int ghes_panic_timeout __read_mostly = 30; | |||
| 97 | * list changing, not for traversing. | 95 | * list changing, not for traversing. |
| 98 | */ | 96 | */ |
| 99 | static LIST_HEAD(ghes_sci); | 97 | static LIST_HEAD(ghes_sci); |
| 100 | static LIST_HEAD(ghes_nmi); | ||
| 101 | static DEFINE_MUTEX(ghes_list_mutex); | 98 | static DEFINE_MUTEX(ghes_list_mutex); |
| 102 | 99 | ||
| 103 | /* | 100 | /* |
| 104 | * NMI may be triggered on any CPU, so ghes_nmi_lock is used for | ||
| 105 | * mutual exclusion. | ||
| 106 | */ | ||
| 107 | static DEFINE_RAW_SPINLOCK(ghes_nmi_lock); | ||
| 108 | |||
| 109 | /* | ||
| 110 | * Because the memory area used to transfer hardware error information | 101 | * Because the memory area used to transfer hardware error information |
| 111 | * from BIOS to Linux can be determined only in NMI, IRQ or timer | 102 | * from BIOS to Linux can be determined only in NMI, IRQ or timer |
| 112 | * handler, but general ioremap can not be used in atomic context, so | 103 | * handler, but general ioremap can not be used in atomic context, so |
| @@ -114,12 +105,16 @@ static DEFINE_RAW_SPINLOCK(ghes_nmi_lock); | |||
| 114 | */ | 105 | */ |
| 115 | 106 | ||
| 116 | /* | 107 | /* |
| 117 | * Two virtual pages are used, one for NMI context, the other for | 108 | * Two virtual pages are used, one for IRQ/PROCESS context, the other for |
| 118 | * IRQ/PROCESS context | 109 | * NMI context (optionally). |
| 119 | */ | 110 | */ |
| 120 | #define GHES_IOREMAP_PAGES 2 | 111 | #ifdef CONFIG_HAVE_ACPI_APEI_NMI |
| 121 | #define GHES_IOREMAP_NMI_PAGE(base) (base) | 112 | #define GHES_IOREMAP_PAGES 2 |
| 122 | #define GHES_IOREMAP_IRQ_PAGE(base) ((base) + PAGE_SIZE) | 113 | #else |
| 114 | #define GHES_IOREMAP_PAGES 1 | ||
| 115 | #endif | ||
| 116 | #define GHES_IOREMAP_IRQ_PAGE(base) (base) | ||
| 117 | #define GHES_IOREMAP_NMI_PAGE(base) ((base) + PAGE_SIZE) | ||
| 123 | 118 | ||
| 124 | /* virtual memory area for atomic ioremap */ | 119 | /* virtual memory area for atomic ioremap */ |
| 125 | static struct vm_struct *ghes_ioremap_area; | 120 | static struct vm_struct *ghes_ioremap_area; |
| @@ -130,18 +125,8 @@ static struct vm_struct *ghes_ioremap_area; | |||
| 130 | static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi); | 125 | static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi); |
| 131 | static DEFINE_SPINLOCK(ghes_ioremap_lock_irq); | 126 | static DEFINE_SPINLOCK(ghes_ioremap_lock_irq); |
| 132 | 127 | ||
| 133 | /* | ||
| 134 | * printk is not safe in NMI context. So in NMI handler, we allocate | ||
| 135 | * required memory from lock-less memory allocator | ||
| 136 | * (ghes_estatus_pool), save estatus into it, put them into lock-less | ||
| 137 | * list (ghes_estatus_llist), then delay printk into IRQ context via | ||
| 138 | * irq_work (ghes_proc_irq_work). ghes_estatus_size_request record | ||
| 139 | * required pool size by all NMI error source. | ||
| 140 | */ | ||
| 141 | static struct gen_pool *ghes_estatus_pool; | 128 | static struct gen_pool *ghes_estatus_pool; |
| 142 | static unsigned long ghes_estatus_pool_size_request; | 129 | static unsigned long ghes_estatus_pool_size_request; |
| 143 | static struct llist_head ghes_estatus_llist; | ||
| 144 | static struct irq_work ghes_proc_irq_work; | ||
| 145 | 130 | ||
| 146 | struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE]; | 131 | struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE]; |
| 147 | static atomic_t ghes_estatus_cache_alloced; | 132 | static atomic_t ghes_estatus_cache_alloced; |
| @@ -192,7 +177,7 @@ static void ghes_iounmap_nmi(void __iomem *vaddr_ptr) | |||
| 192 | 177 | ||
| 193 | BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_NMI_PAGE(base)); | 178 | BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_NMI_PAGE(base)); |
| 194 | unmap_kernel_range_noflush(vaddr, PAGE_SIZE); | 179 | unmap_kernel_range_noflush(vaddr, PAGE_SIZE); |
| 195 | __flush_tlb_one(vaddr); | 180 | arch_apei_flush_tlb_one(vaddr); |
| 196 | } | 181 | } |
| 197 | 182 | ||
| 198 | static void ghes_iounmap_irq(void __iomem *vaddr_ptr) | 183 | static void ghes_iounmap_irq(void __iomem *vaddr_ptr) |
| @@ -202,7 +187,7 @@ static void ghes_iounmap_irq(void __iomem *vaddr_ptr) | |||
| 202 | 187 | ||
| 203 | BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_IRQ_PAGE(base)); | 188 | BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_IRQ_PAGE(base)); |
| 204 | unmap_kernel_range_noflush(vaddr, PAGE_SIZE); | 189 | unmap_kernel_range_noflush(vaddr, PAGE_SIZE); |
| 205 | __flush_tlb_one(vaddr); | 190 | arch_apei_flush_tlb_one(vaddr); |
| 206 | } | 191 | } |
| 207 | 192 | ||
| 208 | static int ghes_estatus_pool_init(void) | 193 | static int ghes_estatus_pool_init(void) |
| @@ -249,11 +234,6 @@ static int ghes_estatus_pool_expand(unsigned long len) | |||
| 249 | return 0; | 234 | return 0; |
| 250 | } | 235 | } |
| 251 | 236 | ||
| 252 | static void ghes_estatus_pool_shrink(unsigned long len) | ||
| 253 | { | ||
| 254 | ghes_estatus_pool_size_request -= PAGE_ALIGN(len); | ||
| 255 | } | ||
| 256 | |||
| 257 | static struct ghes *ghes_new(struct acpi_hest_generic *generic) | 237 | static struct ghes *ghes_new(struct acpi_hest_generic *generic) |
| 258 | { | 238 | { |
| 259 | struct ghes *ghes; | 239 | struct ghes *ghes; |
| @@ -455,9 +435,7 @@ static void ghes_do_proc(struct ghes *ghes, | |||
| 455 | mem_err = (struct cper_sec_mem_err *)(gdata+1); | 435 | mem_err = (struct cper_sec_mem_err *)(gdata+1); |
| 456 | ghes_edac_report_mem_error(ghes, sev, mem_err); | 436 | ghes_edac_report_mem_error(ghes, sev, mem_err); |
| 457 | 437 | ||
| 458 | #ifdef CONFIG_X86_MCE | 438 | arch_apei_report_mem_error(sev, mem_err); |
| 459 | apei_mce_report_mem_error(sev, mem_err); | ||
| 460 | #endif | ||
| 461 | ghes_handle_memory_failure(gdata, sev); | 439 | ghes_handle_memory_failure(gdata, sev); |
| 462 | } | 440 | } |
| 463 | #ifdef CONFIG_ACPI_APEI_PCIEAER | 441 | #ifdef CONFIG_ACPI_APEI_PCIEAER |
| @@ -734,6 +712,32 @@ static int ghes_notify_sci(struct notifier_block *this, | |||
| 734 | return ret; | 712 | return ret; |
| 735 | } | 713 | } |
| 736 | 714 | ||
| 715 | static struct notifier_block ghes_notifier_sci = { | ||
| 716 | .notifier_call = ghes_notify_sci, | ||
| 717 | }; | ||
| 718 | |||
| 719 | #ifdef CONFIG_HAVE_ACPI_APEI_NMI | ||
| 720 | /* | ||
| 721 | * printk is not safe in NMI context. So in NMI handler, we allocate | ||
| 722 | * required memory from lock-less memory allocator | ||
| 723 | * (ghes_estatus_pool), save estatus into it, put them into lock-less | ||
| 724 | * list (ghes_estatus_llist), then delay printk into IRQ context via | ||
| 725 | * irq_work (ghes_proc_irq_work). ghes_estatus_size_request record | ||
| 726 | * required pool size by all NMI error source. | ||
| 727 | */ | ||
| 728 | static struct llist_head ghes_estatus_llist; | ||
| 729 | static struct irq_work ghes_proc_irq_work; | ||
| 730 | |||
| 731 | /* | ||
| 732 | * NMI may be triggered on any CPU, so ghes_nmi_lock is used for | ||
| 733 | * mutual exclusion. | ||
| 734 | */ | ||
| 735 | static DEFINE_RAW_SPINLOCK(ghes_nmi_lock); | ||
| 736 | |||
| 737 | static LIST_HEAD(ghes_nmi); | ||
| 738 | |||
| 739 | static int ghes_panic_timeout __read_mostly = 30; | ||
| 740 | |||
| 737 | static struct llist_node *llist_nodes_reverse(struct llist_node *llnode) | 741 | static struct llist_node *llist_nodes_reverse(struct llist_node *llnode) |
| 738 | { | 742 | { |
| 739 | struct llist_node *next, *tail = NULL; | 743 | struct llist_node *next, *tail = NULL; |
| @@ -877,10 +881,6 @@ out: | |||
| 877 | return ret; | 881 | return ret; |
| 878 | } | 882 | } |
| 879 | 883 | ||
| 880 | static struct notifier_block ghes_notifier_sci = { | ||
| 881 | .notifier_call = ghes_notify_sci, | ||
| 882 | }; | ||
| 883 | |||
| 884 | static unsigned long ghes_esource_prealloc_size( | 884 | static unsigned long ghes_esource_prealloc_size( |
| 885 | const struct acpi_hest_generic *generic) | 885 | const struct acpi_hest_generic *generic) |
| 886 | { | 886 | { |
| @@ -896,11 +896,71 @@ static unsigned long ghes_esource_prealloc_size( | |||
| 896 | return prealloc_size; | 896 | return prealloc_size; |
| 897 | } | 897 | } |
| 898 | 898 | ||
| 899 | static void ghes_estatus_pool_shrink(unsigned long len) | ||
| 900 | { | ||
| 901 | ghes_estatus_pool_size_request -= PAGE_ALIGN(len); | ||
| 902 | } | ||
| 903 | |||
| 904 | static void ghes_nmi_add(struct ghes *ghes) | ||
| 905 | { | ||
| 906 | unsigned long len; | ||
| 907 | |||
| 908 | len = ghes_esource_prealloc_size(ghes->generic); | ||
| 909 | ghes_estatus_pool_expand(len); | ||
| 910 | mutex_lock(&ghes_list_mutex); | ||
| 911 | if (list_empty(&ghes_nmi)) | ||
| 912 | register_nmi_handler(NMI_LOCAL, ghes_notify_nmi, 0, "ghes"); | ||
| 913 | list_add_rcu(&ghes->list, &ghes_nmi); | ||
| 914 | mutex_unlock(&ghes_list_mutex); | ||
| 915 | } | ||
| 916 | |||
| 917 | static void ghes_nmi_remove(struct ghes *ghes) | ||
| 918 | { | ||
| 919 | unsigned long len; | ||
| 920 | |||
| 921 | mutex_lock(&ghes_list_mutex); | ||
| 922 | list_del_rcu(&ghes->list); | ||
| 923 | if (list_empty(&ghes_nmi)) | ||
| 924 | unregister_nmi_handler(NMI_LOCAL, "ghes"); | ||
| 925 | mutex_unlock(&ghes_list_mutex); | ||
| 926 | /* | ||
| 927 | * To synchronize with NMI handler, ghes can only be | ||
| 928 | * freed after NMI handler finishes. | ||
| 929 | */ | ||
| 930 | synchronize_rcu(); | ||
| 931 | len = ghes_esource_prealloc_size(ghes->generic); | ||
| 932 | ghes_estatus_pool_shrink(len); | ||
| 933 | } | ||
| 934 | |||
| 935 | static void ghes_nmi_init_cxt(void) | ||
| 936 | { | ||
| 937 | init_irq_work(&ghes_proc_irq_work, ghes_proc_in_irq); | ||
| 938 | } | ||
| 939 | #else /* CONFIG_HAVE_ACPI_APEI_NMI */ | ||
| 940 | static inline void ghes_nmi_add(struct ghes *ghes) | ||
| 941 | { | ||
| 942 | pr_err(GHES_PFX "ID: %d, trying to add NMI notification which is not supported!\n", | ||
| 943 | ghes->generic->header.source_id); | ||
| 944 | BUG(); | ||
| 945 | } | ||
| 946 | |||
| 947 | static inline void ghes_nmi_remove(struct ghes *ghes) | ||
| 948 | { | ||
| 949 | pr_err(GHES_PFX "ID: %d, trying to remove NMI notification which is not supported!\n", | ||
| 950 | ghes->generic->header.source_id); | ||
| 951 | BUG(); | ||
| 952 | } | ||
| 953 | |||
| 954 | static inline void ghes_nmi_init_cxt(void) | ||
| 955 | { | ||
| 956 | } | ||
| 957 | #endif /* CONFIG_HAVE_ACPI_APEI_NMI */ | ||
| 958 | |||
| 899 | static int ghes_probe(struct platform_device *ghes_dev) | 959 | static int ghes_probe(struct platform_device *ghes_dev) |
| 900 | { | 960 | { |
| 901 | struct acpi_hest_generic *generic; | 961 | struct acpi_hest_generic *generic; |
| 902 | struct ghes *ghes = NULL; | 962 | struct ghes *ghes = NULL; |
| 903 | unsigned long len; | 963 | |
| 904 | int rc = -EINVAL; | 964 | int rc = -EINVAL; |
| 905 | 965 | ||
| 906 | generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data; | 966 | generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data; |
| @@ -911,7 +971,13 @@ static int ghes_probe(struct platform_device *ghes_dev) | |||
| 911 | case ACPI_HEST_NOTIFY_POLLED: | 971 | case ACPI_HEST_NOTIFY_POLLED: |
| 912 | case ACPI_HEST_NOTIFY_EXTERNAL: | 972 | case ACPI_HEST_NOTIFY_EXTERNAL: |
| 913 | case ACPI_HEST_NOTIFY_SCI: | 973 | case ACPI_HEST_NOTIFY_SCI: |
| 974 | break; | ||
| 914 | case ACPI_HEST_NOTIFY_NMI: | 975 | case ACPI_HEST_NOTIFY_NMI: |
| 976 | if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_NMI)) { | ||
| 977 | pr_warn(GHES_PFX "Generic hardware error source: %d notified via NMI interrupt is not supported!\n", | ||
| 978 | generic->header.source_id); | ||
| 979 | goto err; | ||
| 980 | } | ||
| 915 | break; | 981 | break; |
| 916 | case ACPI_HEST_NOTIFY_LOCAL: | 982 | case ACPI_HEST_NOTIFY_LOCAL: |
| 917 | pr_warning(GHES_PFX "Generic hardware error source: %d notified via local interrupt is not supported!\n", | 983 | pr_warning(GHES_PFX "Generic hardware error source: %d notified via local interrupt is not supported!\n", |
| @@ -972,14 +1038,7 @@ static int ghes_probe(struct platform_device *ghes_dev) | |||
| 972 | mutex_unlock(&ghes_list_mutex); | 1038 | mutex_unlock(&ghes_list_mutex); |
| 973 | break; | 1039 | break; |
| 974 | case ACPI_HEST_NOTIFY_NMI: | 1040 | case ACPI_HEST_NOTIFY_NMI: |
| 975 | len = ghes_esource_prealloc_size(generic); | 1041 | ghes_nmi_add(ghes); |
| 976 | ghes_estatus_pool_expand(len); | ||
| 977 | mutex_lock(&ghes_list_mutex); | ||
| 978 | if (list_empty(&ghes_nmi)) | ||
| 979 | register_nmi_handler(NMI_LOCAL, ghes_notify_nmi, 0, | ||
| 980 | "ghes"); | ||
| 981 | list_add_rcu(&ghes->list, &ghes_nmi); | ||
| 982 | mutex_unlock(&ghes_list_mutex); | ||
| 983 | break; | 1042 | break; |
| 984 | default: | 1043 | default: |
| 985 | BUG(); | 1044 | BUG(); |
| @@ -1001,7 +1060,6 @@ static int ghes_remove(struct platform_device *ghes_dev) | |||
| 1001 | { | 1060 | { |
| 1002 | struct ghes *ghes; | 1061 | struct ghes *ghes; |
| 1003 | struct acpi_hest_generic *generic; | 1062 | struct acpi_hest_generic *generic; |
| 1004 | unsigned long len; | ||
| 1005 | 1063 | ||
| 1006 | ghes = platform_get_drvdata(ghes_dev); | 1064 | ghes = platform_get_drvdata(ghes_dev); |
| 1007 | generic = ghes->generic; | 1065 | generic = ghes->generic; |
| @@ -1022,18 +1080,7 @@ static int ghes_remove(struct platform_device *ghes_dev) | |||
| 1022 | mutex_unlock(&ghes_list_mutex); | 1080 | mutex_unlock(&ghes_list_mutex); |
| 1023 | break; | 1081 | break; |
| 1024 | case ACPI_HEST_NOTIFY_NMI: | 1082 | case ACPI_HEST_NOTIFY_NMI: |
| 1025 | mutex_lock(&ghes_list_mutex); | 1083 | ghes_nmi_remove(ghes); |
| 1026 | list_del_rcu(&ghes->list); | ||
| 1027 | if (list_empty(&ghes_nmi)) | ||
| 1028 | unregister_nmi_handler(NMI_LOCAL, "ghes"); | ||
| 1029 | mutex_unlock(&ghes_list_mutex); | ||
| 1030 | /* | ||
| 1031 | * To synchronize with NMI handler, ghes can only be | ||
| 1032 | * freed after NMI handler finishes. | ||
| 1033 | */ | ||
| 1034 | synchronize_rcu(); | ||
| 1035 | len = ghes_esource_prealloc_size(generic); | ||
| 1036 | ghes_estatus_pool_shrink(len); | ||
| 1037 | break; | 1084 | break; |
| 1038 | default: | 1085 | default: |
| 1039 | BUG(); | 1086 | BUG(); |
| @@ -1077,7 +1124,7 @@ static int __init ghes_init(void) | |||
| 1077 | return -EINVAL; | 1124 | return -EINVAL; |
| 1078 | } | 1125 | } |
| 1079 | 1126 | ||
| 1080 | init_irq_work(&ghes_proc_irq_work, ghes_proc_in_irq); | 1127 | ghes_nmi_init_cxt(); |
| 1081 | 1128 | ||
| 1082 | rc = ghes_ioremap_init(); | 1129 | rc = ghes_ioremap_init(); |
| 1083 | if (rc) | 1130 | if (rc) |
diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c index f5e37f32c71f..06e9b411a0a2 100644 --- a/drivers/acpi/apei/hest.c +++ b/drivers/acpi/apei/hest.c | |||
| @@ -36,7 +36,6 @@ | |||
| 36 | #include <linux/io.h> | 36 | #include <linux/io.h> |
| 37 | #include <linux/platform_device.h> | 37 | #include <linux/platform_device.h> |
| 38 | #include <acpi/apei.h> | 38 | #include <acpi/apei.h> |
| 39 | #include <asm/mce.h> | ||
| 40 | 39 | ||
| 41 | #include "apei-internal.h" | 40 | #include "apei-internal.h" |
| 42 | 41 | ||
| @@ -128,33 +127,7 @@ EXPORT_SYMBOL_GPL(apei_hest_parse); | |||
| 128 | */ | 127 | */ |
| 129 | static int __init hest_parse_cmc(struct acpi_hest_header *hest_hdr, void *data) | 128 | static int __init hest_parse_cmc(struct acpi_hest_header *hest_hdr, void *data) |
| 130 | { | 129 | { |
| 131 | #ifdef CONFIG_X86_MCE | 130 | return arch_apei_enable_cmcff(hest_hdr, data); |
| 132 | int i; | ||
| 133 | struct acpi_hest_ia_corrected *cmc; | ||
| 134 | struct acpi_hest_ia_error_bank *mc_bank; | ||
| 135 | |||
| 136 | if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) | ||
| 137 | return 0; | ||
| 138 | |||
| 139 | cmc = (struct acpi_hest_ia_corrected *)hest_hdr; | ||
| 140 | if (!cmc->enabled) | ||
| 141 | return 0; | ||
| 142 | |||
| 143 | /* | ||
| 144 | * We expect HEST to provide a list of MC banks that report errors | ||
| 145 | * in firmware first mode. Otherwise, return non-zero value to | ||
| 146 | * indicate that we are done parsing HEST. | ||
| 147 | */ | ||
| 148 | if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || !cmc->num_hardware_banks) | ||
| 149 | return 1; | ||
| 150 | |||
| 151 | pr_info(HEST_PFX "Enabling Firmware First mode for corrected errors.\n"); | ||
| 152 | |||
| 153 | mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1); | ||
| 154 | for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++) | ||
| 155 | mce_disable_bank(mc_bank->bank_number); | ||
| 156 | #endif | ||
| 157 | return 1; | ||
| 158 | } | 131 | } |
| 159 | 132 | ||
| 160 | struct ghes_arr { | 133 | struct ghes_arr { |
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index e339c6b91425..f8665f9c3e03 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig | |||
| @@ -72,6 +72,7 @@ config EDAC_MCE_INJ | |||
| 72 | 72 | ||
| 73 | config EDAC_MM_EDAC | 73 | config EDAC_MM_EDAC |
| 74 | tristate "Main Memory EDAC (Error Detection And Correction) reporting" | 74 | tristate "Main Memory EDAC (Error Detection And Correction) reporting" |
| 75 | select RAS | ||
| 75 | help | 76 | help |
| 76 | Some systems are able to detect and correct errors in main | 77 | Some systems are able to detect and correct errors in main |
| 77 | memory. EDAC can report statistics on memory error | 78 | memory. EDAC can report statistics on memory error |
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 2c694b5297cc..9f134823fa75 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c | |||
| @@ -33,9 +33,6 @@ | |||
| 33 | #include <asm/edac.h> | 33 | #include <asm/edac.h> |
| 34 | #include "edac_core.h" | 34 | #include "edac_core.h" |
| 35 | #include "edac_module.h" | 35 | #include "edac_module.h" |
| 36 | |||
| 37 | #define CREATE_TRACE_POINTS | ||
| 38 | #define TRACE_INCLUDE_PATH ../../include/ras | ||
| 39 | #include <ras/ras_event.h> | 36 | #include <ras/ras_event.h> |
| 40 | 37 | ||
| 41 | /* lock to memory controller's control array */ | 38 | /* lock to memory controller's control array */ |
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 1491dd4f08f9..437e6fd47311 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c | |||
| @@ -34,6 +34,9 @@ | |||
| 34 | #include <linux/aer.h> | 34 | #include <linux/aer.h> |
| 35 | 35 | ||
| 36 | #define INDENT_SP " " | 36 | #define INDENT_SP " " |
| 37 | |||
| 38 | static char rcd_decode_str[CPER_REC_LEN]; | ||
| 39 | |||
| 37 | /* | 40 | /* |
| 38 | * CPER record ID needs to be unique even after reboot, because record | 41 | * CPER record ID needs to be unique even after reboot, because record |
| 39 | * ID is used as index for ERST storage, while CPER records from | 42 | * ID is used as index for ERST storage, while CPER records from |
| @@ -50,18 +53,19 @@ u64 cper_next_record_id(void) | |||
| 50 | } | 53 | } |
| 51 | EXPORT_SYMBOL_GPL(cper_next_record_id); | 54 | EXPORT_SYMBOL_GPL(cper_next_record_id); |
| 52 | 55 | ||
| 53 | static const char *cper_severity_strs[] = { | 56 | static const char * const severity_strs[] = { |
| 54 | "recoverable", | 57 | "recoverable", |
| 55 | "fatal", | 58 | "fatal", |
| 56 | "corrected", | 59 | "corrected", |
| 57 | "info", | 60 | "info", |
| 58 | }; | 61 | }; |
| 59 | 62 | ||
| 60 | static const char *cper_severity_str(unsigned int severity) | 63 | const char *cper_severity_str(unsigned int severity) |
| 61 | { | 64 | { |
| 62 | return severity < ARRAY_SIZE(cper_severity_strs) ? | 65 | return severity < ARRAY_SIZE(severity_strs) ? |
| 63 | cper_severity_strs[severity] : "unknown"; | 66 | severity_strs[severity] : "unknown"; |
| 64 | } | 67 | } |
| 68 | EXPORT_SYMBOL_GPL(cper_severity_str); | ||
| 65 | 69 | ||
| 66 | /* | 70 | /* |
| 67 | * cper_print_bits - print strings for set bits | 71 | * cper_print_bits - print strings for set bits |
| @@ -100,32 +104,32 @@ void cper_print_bits(const char *pfx, unsigned int bits, | |||
| 100 | printk("%s\n", buf); | 104 | printk("%s\n", buf); |
| 101 | } | 105 | } |
| 102 | 106 | ||
| 103 | static const char * const cper_proc_type_strs[] = { | 107 | static const char * const proc_type_strs[] = { |
| 104 | "IA32/X64", | 108 | "IA32/X64", |
| 105 | "IA64", | 109 | "IA64", |
| 106 | }; | 110 | }; |
| 107 | 111 | ||
| 108 | static const char * const cper_proc_isa_strs[] = { | 112 | static const char * const proc_isa_strs[] = { |
| 109 | "IA32", | 113 | "IA32", |
| 110 | "IA64", | 114 | "IA64", |
| 111 | "X64", | 115 | "X64", |
| 112 | }; | 116 | }; |
| 113 | 117 | ||
| 114 | static const char * const cper_proc_error_type_strs[] = { | 118 | static const char * const proc_error_type_strs[] = { |
| 115 | "cache error", | 119 | "cache error", |
| 116 | "TLB error", | 120 | "TLB error", |
| 117 | "bus error", | 121 | "bus error", |
| 118 | "micro-architectural error", | 122 | "micro-architectural error", |
| 119 | }; | 123 | }; |
| 120 | 124 | ||
| 121 | static const char * const cper_proc_op_strs[] = { | 125 | static const char * const proc_op_strs[] = { |
| 122 | "unknown or generic", | 126 | "unknown or generic", |
| 123 | "data read", | 127 | "data read", |
| 124 | "data write", | 128 | "data write", |
| 125 | "instruction execution", | 129 | "instruction execution", |
| 126 | }; | 130 | }; |
| 127 | 131 | ||
| 128 | static const char * const cper_proc_flag_strs[] = { | 132 | static const char * const proc_flag_strs[] = { |
| 129 | "restartable", | 133 | "restartable", |
| 130 | "precise IP", | 134 | "precise IP", |
| 131 | "overflow", | 135 | "overflow", |
| @@ -137,26 +141,26 @@ static void cper_print_proc_generic(const char *pfx, | |||
| 137 | { | 141 | { |
| 138 | if (proc->validation_bits & CPER_PROC_VALID_TYPE) | 142 | if (proc->validation_bits & CPER_PROC_VALID_TYPE) |
| 139 | printk("%s""processor_type: %d, %s\n", pfx, proc->proc_type, | 143 | printk("%s""processor_type: %d, %s\n", pfx, proc->proc_type, |
| 140 | proc->proc_type < ARRAY_SIZE(cper_proc_type_strs) ? | 144 | proc->proc_type < ARRAY_SIZE(proc_type_strs) ? |
| 141 | cper_proc_type_strs[proc->proc_type] : "unknown"); | 145 | proc_type_strs[proc->proc_type] : "unknown"); |
| 142 | if (proc->validation_bits & CPER_PROC_VALID_ISA) | 146 | if (proc->validation_bits & CPER_PROC_VALID_ISA) |
| 143 | printk("%s""processor_isa: %d, %s\n", pfx, proc->proc_isa, | 147 | printk("%s""processor_isa: %d, %s\n", pfx, proc->proc_isa, |
| 144 | proc->proc_isa < ARRAY_SIZE(cper_proc_isa_strs) ? | 148 | proc->proc_isa < ARRAY_SIZE(proc_isa_strs) ? |
| 145 | cper_proc_isa_strs[proc->proc_isa] : "unknown"); | 149 | proc_isa_strs[proc->proc_isa] : "unknown"); |
| 146 | if (proc->validation_bits & CPER_PROC_VALID_ERROR_TYPE) { | 150 | if (proc->validation_bits & CPER_PROC_VALID_ERROR_TYPE) { |
| 147 | printk("%s""error_type: 0x%02x\n", pfx, proc->proc_error_type); | 151 | printk("%s""error_type: 0x%02x\n", pfx, proc->proc_error_type); |
| 148 | cper_print_bits(pfx, proc->proc_error_type, | 152 | cper_print_bits(pfx, proc->proc_error_type, |
| 149 | cper_proc_error_type_strs, | 153 | proc_error_type_strs, |
| 150 | ARRAY_SIZE(cper_proc_error_type_strs)); | 154 | ARRAY_SIZE(proc_error_type_strs)); |
| 151 | } | 155 | } |
| 152 | if (proc->validation_bits & CPER_PROC_VALID_OPERATION) | 156 | if (proc->validation_bits & CPER_PROC_VALID_OPERATION) |
| 153 | printk("%s""operation: %d, %s\n", pfx, proc->operation, | 157 | printk("%s""operation: %d, %s\n", pfx, proc->operation, |
| 154 | proc->operation < ARRAY_SIZE(cper_proc_op_strs) ? | 158 | proc->operation < ARRAY_SIZE(proc_op_strs) ? |
| 155 | cper_proc_op_strs[proc->operation] : "unknown"); | 159 | proc_op_strs[proc->operation] : "unknown"); |
| 156 | if (proc->validation_bits & CPER_PROC_VALID_FLAGS) { | 160 | if (proc->validation_bits & CPER_PROC_VALID_FLAGS) { |
| 157 | printk("%s""flags: 0x%02x\n", pfx, proc->flags); | 161 | printk("%s""flags: 0x%02x\n", pfx, proc->flags); |
| 158 | cper_print_bits(pfx, proc->flags, cper_proc_flag_strs, | 162 | cper_print_bits(pfx, proc->flags, proc_flag_strs, |
| 159 | ARRAY_SIZE(cper_proc_flag_strs)); | 163 | ARRAY_SIZE(proc_flag_strs)); |
| 160 | } | 164 | } |
| 161 | if (proc->validation_bits & CPER_PROC_VALID_LEVEL) | 165 | if (proc->validation_bits & CPER_PROC_VALID_LEVEL) |
| 162 | printk("%s""level: %d\n", pfx, proc->level); | 166 | printk("%s""level: %d\n", pfx, proc->level); |
| @@ -177,7 +181,7 @@ static void cper_print_proc_generic(const char *pfx, | |||
| 177 | printk("%s""IP: 0x%016llx\n", pfx, proc->ip); | 181 | printk("%s""IP: 0x%016llx\n", pfx, proc->ip); |
| 178 | } | 182 | } |
| 179 | 183 | ||
| 180 | static const char *cper_mem_err_type_strs[] = { | 184 | static const char * const mem_err_type_strs[] = { |
| 181 | "unknown", | 185 | "unknown", |
| 182 | "no error", | 186 | "no error", |
| 183 | "single-bit ECC", | 187 | "single-bit ECC", |
| @@ -196,58 +200,136 @@ static const char *cper_mem_err_type_strs[] = { | |||
| 196 | "physical memory map-out event", | 200 | "physical memory map-out event", |
| 197 | }; | 201 | }; |
| 198 | 202 | ||
| 199 | static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem) | 203 | const char *cper_mem_err_type_str(unsigned int etype) |
| 200 | { | 204 | { |
| 201 | if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS) | 205 | return etype < ARRAY_SIZE(mem_err_type_strs) ? |
| 202 | printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status); | 206 | mem_err_type_strs[etype] : "unknown"; |
| 203 | if (mem->validation_bits & CPER_MEM_VALID_PA) | 207 | } |
| 204 | printk("%s""physical_address: 0x%016llx\n", | 208 | EXPORT_SYMBOL_GPL(cper_mem_err_type_str); |
| 205 | pfx, mem->physical_addr); | 209 | |
| 206 | if (mem->validation_bits & CPER_MEM_VALID_PA_MASK) | 210 | static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg) |
| 207 | printk("%s""physical_address_mask: 0x%016llx\n", | 211 | { |
| 208 | pfx, mem->physical_addr_mask); | 212 | u32 len, n; |
| 213 | |||
| 214 | if (!msg) | ||
| 215 | return 0; | ||
| 216 | |||
| 217 | n = 0; | ||
| 218 | len = CPER_REC_LEN - 1; | ||
| 209 | if (mem->validation_bits & CPER_MEM_VALID_NODE) | 219 | if (mem->validation_bits & CPER_MEM_VALID_NODE) |
| 210 | pr_debug("node: %d\n", mem->node); | 220 | n += scnprintf(msg + n, len - n, "node: %d ", mem->node); |
| 211 | if (mem->validation_bits & CPER_MEM_VALID_CARD) | 221 | if (mem->validation_bits & CPER_MEM_VALID_CARD) |
| 212 | pr_debug("card: %d\n", mem->card); | 222 | n += scnprintf(msg + n, len - n, "card: %d ", mem->card); |
| 213 | if (mem->validation_bits & CPER_MEM_VALID_MODULE) | 223 | if (mem->validation_bits & CPER_MEM_VALID_MODULE) |
| 214 | pr_debug("module: %d\n", mem->module); | 224 | n += scnprintf(msg + n, len - n, "module: %d ", mem->module); |
| 215 | if (mem->validation_bits & CPER_MEM_VALID_RANK_NUMBER) | 225 | if (mem->validation_bits & CPER_MEM_VALID_RANK_NUMBER) |
| 216 | pr_debug("rank: %d\n", mem->rank); | 226 | n += scnprintf(msg + n, len - n, "rank: %d ", mem->rank); |
| 217 | if (mem->validation_bits & CPER_MEM_VALID_BANK) | 227 | if (mem->validation_bits & CPER_MEM_VALID_BANK) |
| 218 | pr_debug("bank: %d\n", mem->bank); | 228 | n += scnprintf(msg + n, len - n, "bank: %d ", mem->bank); |
| 219 | if (mem->validation_bits & CPER_MEM_VALID_DEVICE) | 229 | if (mem->validation_bits & CPER_MEM_VALID_DEVICE) |
| 220 | pr_debug("device: %d\n", mem->device); | 230 | n += scnprintf(msg + n, len - n, "device: %d ", mem->device); |
| 221 | if (mem->validation_bits & CPER_MEM_VALID_ROW) | 231 | if (mem->validation_bits & CPER_MEM_VALID_ROW) |
| 222 | pr_debug("row: %d\n", mem->row); | 232 | n += scnprintf(msg + n, len - n, "row: %d ", mem->row); |
| 223 | if (mem->validation_bits & CPER_MEM_VALID_COLUMN) | 233 | if (mem->validation_bits & CPER_MEM_VALID_COLUMN) |
| 224 | pr_debug("column: %d\n", mem->column); | 234 | n += scnprintf(msg + n, len - n, "column: %d ", mem->column); |
| 225 | if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION) | 235 | if (mem->validation_bits & CPER_MEM_VALID_BIT_POSITION) |
| 226 | pr_debug("bit_position: %d\n", mem->bit_pos); | 236 | n += scnprintf(msg + n, len - n, "bit_position: %d ", |
| 237 | mem->bit_pos); | ||
| 227 | if (mem->validation_bits & CPER_MEM_VALID_REQUESTOR_ID) | 238 | if (mem->validation_bits & CPER_MEM_VALID_REQUESTOR_ID) |
| 228 | pr_debug("requestor_id: 0x%016llx\n", mem->requestor_id); | 239 | n += scnprintf(msg + n, len - n, "requestor_id: 0x%016llx ", |
| 240 | mem->requestor_id); | ||
| 229 | if (mem->validation_bits & CPER_MEM_VALID_RESPONDER_ID) | 241 | if (mem->validation_bits & CPER_MEM_VALID_RESPONDER_ID) |
| 230 | pr_debug("responder_id: 0x%016llx\n", mem->responder_id); | 242 | n += scnprintf(msg + n, len - n, "responder_id: 0x%016llx ", |
| 243 | mem->responder_id); | ||
| 231 | if (mem->validation_bits & CPER_MEM_VALID_TARGET_ID) | 244 | if (mem->validation_bits & CPER_MEM_VALID_TARGET_ID) |
| 232 | pr_debug("target_id: 0x%016llx\n", mem->target_id); | 245 | scnprintf(msg + n, len - n, "target_id: 0x%016llx ", |
| 246 | mem->target_id); | ||
| 247 | |||
| 248 | msg[n] = '\0'; | ||
| 249 | return n; | ||
| 250 | } | ||
| 251 | |||
| 252 | static int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg) | ||
| 253 | { | ||
| 254 | u32 len, n; | ||
| 255 | const char *bank = NULL, *device = NULL; | ||
| 256 | |||
| 257 | if (!msg || !(mem->validation_bits & CPER_MEM_VALID_MODULE_HANDLE)) | ||
| 258 | return 0; | ||
| 259 | |||
| 260 | n = 0; | ||
| 261 | len = CPER_REC_LEN - 1; | ||
| 262 | dmi_memdev_name(mem->mem_dev_handle, &bank, &device); | ||
| 263 | if (bank && device) | ||
| 264 | n = snprintf(msg, len, "DIMM location: %s %s ", bank, device); | ||
| 265 | else | ||
| 266 | n = snprintf(msg, len, | ||
| 267 | "DIMM location: not present. DMI handle: 0x%.4x ", | ||
| 268 | mem->mem_dev_handle); | ||
| 269 | |||
| 270 | msg[n] = '\0'; | ||
| 271 | return n; | ||
| 272 | } | ||
| 273 | |||
| 274 | void cper_mem_err_pack(const struct cper_sec_mem_err *mem, | ||
| 275 | struct cper_mem_err_compact *cmem) | ||
| 276 | { | ||
| 277 | cmem->validation_bits = mem->validation_bits; | ||
| 278 | cmem->node = mem->node; | ||
| 279 | cmem->card = mem->card; | ||
| 280 | cmem->module = mem->module; | ||
| 281 | cmem->bank = mem->bank; | ||
| 282 | cmem->device = mem->device; | ||
| 283 | cmem->row = mem->row; | ||
| 284 | cmem->column = mem->column; | ||
| 285 | cmem->bit_pos = mem->bit_pos; | ||
| 286 | cmem->requestor_id = mem->requestor_id; | ||
| 287 | cmem->responder_id = mem->responder_id; | ||
| 288 | cmem->target_id = mem->target_id; | ||
| 289 | cmem->rank = mem->rank; | ||
| 290 | cmem->mem_array_handle = mem->mem_array_handle; | ||
| 291 | cmem->mem_dev_handle = mem->mem_dev_handle; | ||
| 292 | } | ||
| 293 | |||
| 294 | const char *cper_mem_err_unpack(struct trace_seq *p, | ||
| 295 | struct cper_mem_err_compact *cmem) | ||
| 296 | { | ||
| 297 | const char *ret = p->buffer + p->len; | ||
| 298 | |||
| 299 | if (cper_mem_err_location(cmem, rcd_decode_str)) | ||
| 300 | trace_seq_printf(p, "%s", rcd_decode_str); | ||
| 301 | if (cper_dimm_err_location(cmem, rcd_decode_str)) | ||
| 302 | trace_seq_printf(p, "%s", rcd_decode_str); | ||
| 303 | trace_seq_putc(p, '\0'); | ||
| 304 | |||
| 305 | return ret; | ||
| 306 | } | ||
| 307 | |||
| 308 | static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem) | ||
| 309 | { | ||
| 310 | struct cper_mem_err_compact cmem; | ||
| 311 | |||
| 312 | if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS) | ||
| 313 | printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status); | ||
| 314 | if (mem->validation_bits & CPER_MEM_VALID_PA) | ||
| 315 | printk("%s""physical_address: 0x%016llx\n", | ||
| 316 | pfx, mem->physical_addr); | ||
| 317 | if (mem->validation_bits & CPER_MEM_VALID_PA_MASK) | ||
| 318 | printk("%s""physical_address_mask: 0x%016llx\n", | ||
| 319 | pfx, mem->physical_addr_mask); | ||
| 320 | cper_mem_err_pack(mem, &cmem); | ||
| 321 | if (cper_mem_err_location(&cmem, rcd_decode_str)) | ||
| 322 | printk("%s%s\n", pfx, rcd_decode_str); | ||
| 233 | if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) { | 323 | if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) { |
| 234 | u8 etype = mem->error_type; | 324 | u8 etype = mem->error_type; |
| 235 | printk("%s""error_type: %d, %s\n", pfx, etype, | 325 | printk("%s""error_type: %d, %s\n", pfx, etype, |
| 236 | etype < ARRAY_SIZE(cper_mem_err_type_strs) ? | 326 | cper_mem_err_type_str(etype)); |
| 237 | cper_mem_err_type_strs[etype] : "unknown"); | ||
| 238 | } | ||
| 239 | if (mem->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) { | ||
| 240 | const char *bank = NULL, *device = NULL; | ||
| 241 | dmi_memdev_name(mem->mem_dev_handle, &bank, &device); | ||
| 242 | if (bank != NULL && device != NULL) | ||
| 243 | printk("%s""DIMM location: %s %s", pfx, bank, device); | ||
| 244 | else | ||
| 245 | printk("%s""DIMM DMI handle: 0x%.4x", | ||
| 246 | pfx, mem->mem_dev_handle); | ||
| 247 | } | 327 | } |
| 328 | if (cper_dimm_err_location(&cmem, rcd_decode_str)) | ||
| 329 | printk("%s%s\n", pfx, rcd_decode_str); | ||
| 248 | } | 330 | } |
| 249 | 331 | ||
| 250 | static const char *cper_pcie_port_type_strs[] = { | 332 | static const char * const pcie_port_type_strs[] = { |
| 251 | "PCIe end point", | 333 | "PCIe end point", |
| 252 | "legacy PCI end point", | 334 | "legacy PCI end point", |
| 253 | "unknown", | 335 | "unknown", |
| @@ -266,8 +348,8 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie, | |||
| 266 | { | 348 | { |
| 267 | if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE) | 349 | if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE) |
| 268 | printk("%s""port_type: %d, %s\n", pfx, pcie->port_type, | 350 | printk("%s""port_type: %d, %s\n", pfx, pcie->port_type, |
| 269 | pcie->port_type < ARRAY_SIZE(cper_pcie_port_type_strs) ? | 351 | pcie->port_type < ARRAY_SIZE(pcie_port_type_strs) ? |
| 270 | cper_pcie_port_type_strs[pcie->port_type] : "unknown"); | 352 | pcie_port_type_strs[pcie->port_type] : "unknown"); |
| 271 | if (pcie->validation_bits & CPER_PCIE_VALID_VERSION) | 353 | if (pcie->validation_bits & CPER_PCIE_VALID_VERSION) |
| 272 | printk("%s""version: %d.%d\n", pfx, | 354 | printk("%s""version: %d.%d\n", pfx, |
| 273 | pcie->version.major, pcie->version.minor); | 355 | pcie->version.major, pcie->version.minor); |
diff --git a/drivers/pci/pcie/aer/Kconfig b/drivers/pci/pcie/aer/Kconfig index 50e94e02378a..389440228c1d 100644 --- a/drivers/pci/pcie/aer/Kconfig +++ b/drivers/pci/pcie/aer/Kconfig | |||
| @@ -5,6 +5,7 @@ | |||
| 5 | config PCIEAER | 5 | config PCIEAER |
| 6 | boolean "Root Port Advanced Error Reporting support" | 6 | boolean "Root Port Advanced Error Reporting support" |
| 7 | depends on PCIEPORTBUS | 7 | depends on PCIEPORTBUS |
| 8 | select RAS | ||
| 8 | default y | 9 | default y |
| 9 | help | 10 | help |
| 10 | This enables PCI Express Root Port Advanced Error Reporting | 11 | This enables PCI Express Root Port Advanced Error Reporting |
diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c index 36ed31b52198..35d06e177917 100644 --- a/drivers/pci/pcie/aer/aerdrv_errprint.c +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c | |||
| @@ -22,9 +22,7 @@ | |||
| 22 | #include <linux/cper.h> | 22 | #include <linux/cper.h> |
| 23 | 23 | ||
| 24 | #include "aerdrv.h" | 24 | #include "aerdrv.h" |
| 25 | 25 | #include <ras/ras_event.h> | |
| 26 | #define CREATE_TRACE_POINTS | ||
| 27 | #include <trace/events/ras.h> | ||
| 28 | 26 | ||
| 29 | #define AER_AGENT_RECEIVER 0 | 27 | #define AER_AGENT_RECEIVER 0 |
| 30 | #define AER_AGENT_REQUESTER 1 | 28 | #define AER_AGENT_REQUESTER 1 |
diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig new file mode 100644 index 000000000000..f9da613052c2 --- /dev/null +++ b/drivers/ras/Kconfig | |||
| @@ -0,0 +1,2 @@ | |||
| 1 | config RAS | ||
| 2 | bool | ||
diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile new file mode 100644 index 000000000000..d7f73341ced3 --- /dev/null +++ b/drivers/ras/Makefile | |||
| @@ -0,0 +1 @@ | |||
| obj-$(CONFIG_RAS) += ras.o debugfs.o | |||
diff --git a/drivers/ras/debugfs.c b/drivers/ras/debugfs.c new file mode 100644 index 000000000000..0322acf67ea5 --- /dev/null +++ b/drivers/ras/debugfs.c | |||
| @@ -0,0 +1,56 @@ | |||
| 1 | #include <linux/debugfs.h> | ||
| 2 | |||
| 3 | static struct dentry *ras_debugfs_dir; | ||
| 4 | |||
| 5 | static atomic_t trace_count = ATOMIC_INIT(0); | ||
| 6 | |||
| 7 | int ras_userspace_consumers(void) | ||
| 8 | { | ||
| 9 | return atomic_read(&trace_count); | ||
| 10 | } | ||
| 11 | EXPORT_SYMBOL_GPL(ras_userspace_consumers); | ||
| 12 | |||
| 13 | static int trace_show(struct seq_file *m, void *v) | ||
| 14 | { | ||
| 15 | return atomic_read(&trace_count); | ||
| 16 | } | ||
| 17 | |||
| 18 | static int trace_open(struct inode *inode, struct file *file) | ||
| 19 | { | ||
| 20 | atomic_inc(&trace_count); | ||
| 21 | return single_open(file, trace_show, NULL); | ||
| 22 | } | ||
| 23 | |||
| 24 | static int trace_release(struct inode *inode, struct file *file) | ||
| 25 | { | ||
| 26 | atomic_dec(&trace_count); | ||
| 27 | return single_release(inode, file); | ||
| 28 | } | ||
| 29 | |||
| 30 | static const struct file_operations trace_fops = { | ||
| 31 | .open = trace_open, | ||
| 32 | .read = seq_read, | ||
| 33 | .llseek = seq_lseek, | ||
| 34 | .release = trace_release, | ||
| 35 | }; | ||
| 36 | |||
| 37 | int __init ras_add_daemon_trace(void) | ||
| 38 | { | ||
| 39 | struct dentry *fentry; | ||
| 40 | |||
| 41 | if (!ras_debugfs_dir) | ||
| 42 | return -ENOENT; | ||
| 43 | |||
| 44 | fentry = debugfs_create_file("daemon_active", S_IRUSR, ras_debugfs_dir, | ||
| 45 | NULL, &trace_fops); | ||
| 46 | if (!fentry) | ||
| 47 | return -ENODEV; | ||
| 48 | |||
| 49 | return 0; | ||
| 50 | |||
| 51 | } | ||
| 52 | |||
| 53 | void __init ras_debugfs_init(void) | ||
| 54 | { | ||
| 55 | ras_debugfs_dir = debugfs_create_dir("ras", NULL); | ||
| 56 | } | ||
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c new file mode 100644 index 000000000000..b67dd362b7b6 --- /dev/null +++ b/drivers/ras/ras.c | |||
| @@ -0,0 +1,29 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2014 Intel Corporation | ||
| 3 | * | ||
| 4 | * Authors: | ||
| 5 | * Chen, Gong <gong.chen@linux.intel.com> | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include <linux/init.h> | ||
| 9 | #include <linux/ras.h> | ||
| 10 | |||
| 11 | #define CREATE_TRACE_POINTS | ||
| 12 | #define TRACE_INCLUDE_PATH ../../include/ras | ||
| 13 | #include <ras/ras_event.h> | ||
| 14 | |||
| 15 | static int __init ras_init(void) | ||
| 16 | { | ||
| 17 | int rc = 0; | ||
| 18 | |||
| 19 | ras_debugfs_init(); | ||
| 20 | rc = ras_add_daemon_trace(); | ||
| 21 | |||
| 22 | return rc; | ||
| 23 | } | ||
| 24 | subsys_initcall(ras_init); | ||
| 25 | |||
| 26 | #if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE) | ||
| 27 | EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event); | ||
| 28 | #endif | ||
| 29 | EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event); | ||
diff --git a/include/acpi/apei.h b/include/acpi/apei.h index 04f349d8da73..76284bb560a6 100644 --- a/include/acpi/apei.h +++ b/include/acpi/apei.h | |||
| @@ -42,5 +42,9 @@ ssize_t erst_read(u64 record_id, struct cper_record_header *record, | |||
| 42 | size_t buflen); | 42 | size_t buflen); |
| 43 | int erst_clear(u64 record_id); | 43 | int erst_clear(u64 record_id); |
| 44 | 44 | ||
| 45 | int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data); | ||
| 46 | void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err); | ||
| 47 | void arch_apei_flush_tlb_one(unsigned long addr); | ||
| 48 | |||
| 45 | #endif | 49 | #endif |
| 46 | #endif | 50 | #endif |
diff --git a/include/linux/aer.h b/include/linux/aer.h index 4dbaa7081530..c826d1c28f9c 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h | |||
| @@ -11,6 +11,8 @@ | |||
| 11 | #define AER_FATAL 1 | 11 | #define AER_FATAL 1 |
| 12 | #define AER_CORRECTABLE 2 | 12 | #define AER_CORRECTABLE 2 |
| 13 | 13 | ||
| 14 | struct pci_dev; | ||
| 15 | |||
| 14 | struct aer_header_log_regs { | 16 | struct aer_header_log_regs { |
| 15 | unsigned int dw0; | 17 | unsigned int dw0; |
| 16 | unsigned int dw1; | 18 | unsigned int dw1; |
diff --git a/include/linux/cper.h b/include/linux/cper.h index 2fc0ec3d89cc..76abba4b238e 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #define LINUX_CPER_H | 22 | #define LINUX_CPER_H |
| 23 | 23 | ||
| 24 | #include <linux/uuid.h> | 24 | #include <linux/uuid.h> |
| 25 | #include <linux/trace_seq.h> | ||
| 25 | 26 | ||
| 26 | /* CPER record signature and the size */ | 27 | /* CPER record signature and the size */ |
| 27 | #define CPER_SIG_RECORD "CPER" | 28 | #define CPER_SIG_RECORD "CPER" |
| @@ -36,6 +37,13 @@ | |||
| 36 | #define CPER_RECORD_REV 0x0100 | 37 | #define CPER_RECORD_REV 0x0100 |
| 37 | 38 | ||
| 38 | /* | 39 | /* |
| 40 | * CPER record length contains the CPER fields which are relevant for further | ||
| 41 | * handling of a memory error in userspace (we don't carry all the fields | ||
| 42 | * defined in the UEFI spec because some of them don't make any sense.) | ||
| 43 | * Currently, a length of 256 should be more than enough. | ||
| 44 | */ | ||
| 45 | #define CPER_REC_LEN 256 | ||
| 46 | /* | ||
| 39 | * Severity definition for error_severity in struct cper_record_header | 47 | * Severity definition for error_severity in struct cper_record_header |
| 40 | * and section_severity in struct cper_section_descriptor | 48 | * and section_severity in struct cper_section_descriptor |
| 41 | */ | 49 | */ |
| @@ -356,6 +364,24 @@ struct cper_sec_mem_err { | |||
| 356 | __u16 mem_dev_handle; /* module handle in UEFI 2.4 */ | 364 | __u16 mem_dev_handle; /* module handle in UEFI 2.4 */ |
| 357 | }; | 365 | }; |
| 358 | 366 | ||
| 367 | struct cper_mem_err_compact { | ||
| 368 | __u64 validation_bits; | ||
| 369 | __u16 node; | ||
| 370 | __u16 card; | ||
| 371 | __u16 module; | ||
| 372 | __u16 bank; | ||
| 373 | __u16 device; | ||
| 374 | __u16 row; | ||
| 375 | __u16 column; | ||
| 376 | __u16 bit_pos; | ||
| 377 | __u64 requestor_id; | ||
| 378 | __u64 responder_id; | ||
| 379 | __u64 target_id; | ||
| 380 | __u16 rank; | ||
| 381 | __u16 mem_array_handle; | ||
| 382 | __u16 mem_dev_handle; | ||
| 383 | }; | ||
| 384 | |||
| 359 | struct cper_sec_pcie { | 385 | struct cper_sec_pcie { |
| 360 | __u64 validation_bits; | 386 | __u64 validation_bits; |
| 361 | __u32 port_type; | 387 | __u32 port_type; |
| @@ -395,7 +421,13 @@ struct cper_sec_pcie { | |||
| 395 | #pragma pack() | 421 | #pragma pack() |
| 396 | 422 | ||
| 397 | u64 cper_next_record_id(void); | 423 | u64 cper_next_record_id(void); |
| 424 | const char *cper_severity_str(unsigned int); | ||
| 425 | const char *cper_mem_err_type_str(unsigned int); | ||
| 398 | void cper_print_bits(const char *prefix, unsigned int bits, | 426 | void cper_print_bits(const char *prefix, unsigned int bits, |
| 399 | const char * const strs[], unsigned int strs_size); | 427 | const char * const strs[], unsigned int strs_size); |
| 428 | void cper_mem_err_pack(const struct cper_sec_mem_err *, | ||
| 429 | struct cper_mem_err_compact *); | ||
| 430 | const char *cper_mem_err_unpack(struct trace_seq *, | ||
| 431 | struct cper_mem_err_compact *); | ||
| 400 | 432 | ||
| 401 | #endif | 433 | #endif |
diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 447775ee2c4b..1d2a6ab6b8bb 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h | |||
| @@ -63,4 +63,8 @@ extern int proc_dowatchdog(struct ctl_table *, int , | |||
| 63 | void __user *, size_t *, loff_t *); | 63 | void __user *, size_t *, loff_t *); |
| 64 | #endif | 64 | #endif |
| 65 | 65 | ||
| 66 | #ifdef CONFIG_HAVE_ACPI_APEI_NMI | ||
| 67 | #include <asm/nmi.h> | ||
| 68 | #endif | ||
| 69 | |||
| 66 | #endif | 70 | #endif |
diff --git a/include/linux/ras.h b/include/linux/ras.h new file mode 100644 index 000000000000..2aceeafd6fe5 --- /dev/null +++ b/include/linux/ras.h | |||
| @@ -0,0 +1,14 @@ | |||
| 1 | #ifndef __RAS_H__ | ||
| 2 | #define __RAS_H__ | ||
| 3 | |||
| 4 | #ifdef CONFIG_DEBUG_FS | ||
| 5 | int ras_userspace_consumers(void); | ||
| 6 | void ras_debugfs_init(void); | ||
| 7 | int ras_add_daemon_trace(void); | ||
| 8 | #else | ||
| 9 | static inline int ras_userspace_consumers(void) { return 0; } | ||
| 10 | static inline void ras_debugfs_init(void) { return; } | ||
| 11 | static inline int ras_add_daemon_trace(void) { return 0; } | ||
| 12 | #endif | ||
| 13 | |||
| 14 | #endif | ||
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 21cdb0b7b0fb..47da53c27ffa 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h | |||
| @@ -8,6 +8,71 @@ | |||
| 8 | #include <linux/tracepoint.h> | 8 | #include <linux/tracepoint.h> |
| 9 | #include <linux/edac.h> | 9 | #include <linux/edac.h> |
| 10 | #include <linux/ktime.h> | 10 | #include <linux/ktime.h> |
| 11 | #include <linux/aer.h> | ||
| 12 | #include <linux/cper.h> | ||
| 13 | |||
| 14 | /* | ||
| 15 | * MCE Extended Error Log trace event | ||
| 16 | * | ||
| 17 | * These events are generated when hardware detects a corrected or | ||
| 18 | * uncorrected event. | ||
| 19 | */ | ||
| 20 | |||
| 21 | /* memory trace event */ | ||
| 22 | |||
| 23 | #if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE) | ||
| 24 | TRACE_EVENT(extlog_mem_event, | ||
| 25 | TP_PROTO(struct cper_sec_mem_err *mem, | ||
| 26 | u32 err_seq, | ||
| 27 | const uuid_le *fru_id, | ||
| 28 | const char *fru_text, | ||
| 29 | u8 sev), | ||
| 30 | |||
| 31 | TP_ARGS(mem, err_seq, fru_id, fru_text, sev), | ||
| 32 | |||
| 33 | TP_STRUCT__entry( | ||
| 34 | __field(u32, err_seq) | ||
| 35 | __field(u8, etype) | ||
| 36 | __field(u8, sev) | ||
| 37 | __field(u64, pa) | ||
| 38 | __field(u8, pa_mask_lsb) | ||
| 39 | __field_struct(uuid_le, fru_id) | ||
| 40 | __string(fru_text, fru_text) | ||
| 41 | __field_struct(struct cper_mem_err_compact, data) | ||
| 42 | ), | ||
| 43 | |||
| 44 | TP_fast_assign( | ||
| 45 | __entry->err_seq = err_seq; | ||
| 46 | if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) | ||
| 47 | __entry->etype = mem->error_type; | ||
| 48 | else | ||
| 49 | __entry->etype = ~0; | ||
| 50 | __entry->sev = sev; | ||
| 51 | if (mem->validation_bits & CPER_MEM_VALID_PA) | ||
| 52 | __entry->pa = mem->physical_addr; | ||
| 53 | else | ||
| 54 | __entry->pa = ~0ull; | ||
| 55 | |||
| 56 | if (mem->validation_bits & CPER_MEM_VALID_PA_MASK) | ||
| 57 | __entry->pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask); | ||
| 58 | else | ||
| 59 | __entry->pa_mask_lsb = ~0; | ||
| 60 | __entry->fru_id = *fru_id; | ||
| 61 | __assign_str(fru_text, fru_text); | ||
| 62 | cper_mem_err_pack(mem, &__entry->data); | ||
| 63 | ), | ||
| 64 | |||
| 65 | TP_printk("{%d} %s error: %s physical addr: %016llx (mask lsb: %x) %sFRU: %pUl %.20s", | ||
| 66 | __entry->err_seq, | ||
| 67 | cper_severity_str(__entry->sev), | ||
| 68 | cper_mem_err_type_str(__entry->etype), | ||
| 69 | __entry->pa, | ||
| 70 | __entry->pa_mask_lsb, | ||
| 71 | cper_mem_err_unpack(p, &__entry->data), | ||
| 72 | &__entry->fru_id, | ||
| 73 | __get_str(fru_text)) | ||
| 74 | ); | ||
| 75 | #endif | ||
| 11 | 76 | ||
| 12 | /* | 77 | /* |
| 13 | * Hardware Events Report | 78 | * Hardware Events Report |
| @@ -94,6 +159,69 @@ TRACE_EVENT(mc_event, | |||
| 94 | __get_str(driver_detail)) | 159 | __get_str(driver_detail)) |
| 95 | ); | 160 | ); |
| 96 | 161 | ||
| 162 | /* | ||
| 163 | * PCIe AER Trace event | ||
| 164 | * | ||
| 165 | * These events are generated when hardware detects a corrected or | ||
| 166 | * uncorrected event on a PCIe device. The event report has | ||
| 167 | * the following structure: | ||
| 168 | * | ||
| 169 | * char * dev_name - The name of the slot where the device resides | ||
| 170 | * ([domain:]bus:device.function). | ||
| 171 | * u32 status - Either the correctable or uncorrectable register | ||
| 172 | * indicating what error or errors have been seen | ||
| 173 | * u8 severity - error severity 0:NONFATAL 1:FATAL 2:CORRECTED | ||
| 174 | */ | ||
| 175 | |||
| 176 | #define aer_correctable_errors \ | ||
| 177 | {BIT(0), "Receiver Error"}, \ | ||
| 178 | {BIT(6), "Bad TLP"}, \ | ||
| 179 | {BIT(7), "Bad DLLP"}, \ | ||
| 180 | {BIT(8), "RELAY_NUM Rollover"}, \ | ||
| 181 | {BIT(12), "Replay Timer Timeout"}, \ | ||
| 182 | {BIT(13), "Advisory Non-Fatal"} | ||
| 183 | |||
| 184 | #define aer_uncorrectable_errors \ | ||
| 185 | {BIT(4), "Data Link Protocol"}, \ | ||
| 186 | {BIT(12), "Poisoned TLP"}, \ | ||
| 187 | {BIT(13), "Flow Control Protocol"}, \ | ||
| 188 | {BIT(14), "Completion Timeout"}, \ | ||
| 189 | {BIT(15), "Completer Abort"}, \ | ||
| 190 | {BIT(16), "Unexpected Completion"}, \ | ||
| 191 | {BIT(17), "Receiver Overflow"}, \ | ||
| 192 | {BIT(18), "Malformed TLP"}, \ | ||
| 193 | {BIT(19), "ECRC"}, \ | ||
| 194 | {BIT(20), "Unsupported Request"} | ||
| 195 | |||
| 196 | TRACE_EVENT(aer_event, | ||
| 197 | TP_PROTO(const char *dev_name, | ||
| 198 | const u32 status, | ||
| 199 | const u8 severity), | ||
| 200 | |||
| 201 | TP_ARGS(dev_name, status, severity), | ||
| 202 | |||
| 203 | TP_STRUCT__entry( | ||
| 204 | __string( dev_name, dev_name ) | ||
| 205 | __field( u32, status ) | ||
| 206 | __field( u8, severity ) | ||
| 207 | ), | ||
| 208 | |||
| 209 | TP_fast_assign( | ||
| 210 | __assign_str(dev_name, dev_name); | ||
| 211 | __entry->status = status; | ||
| 212 | __entry->severity = severity; | ||
| 213 | ), | ||
| 214 | |||
| 215 | TP_printk("%s PCIe Bus Error: severity=%s, %s\n", | ||
| 216 | __get_str(dev_name), | ||
| 217 | __entry->severity == AER_CORRECTABLE ? "Corrected" : | ||
| 218 | __entry->severity == AER_FATAL ? | ||
| 219 | "Fatal" : "Uncorrected, non-fatal", | ||
| 220 | __entry->severity == AER_CORRECTABLE ? | ||
| 221 | __print_flags(__entry->status, "|", aer_correctable_errors) : | ||
| 222 | __print_flags(__entry->status, "|", aer_uncorrectable_errors)) | ||
| 223 | ); | ||
| 224 | |||
| 97 | #endif /* _TRACE_HW_EVENT_MC_H */ | 225 | #endif /* _TRACE_HW_EVENT_MC_H */ |
| 98 | 226 | ||
| 99 | /* This part must be outside protection */ | 227 | /* This part must be outside protection */ |
diff --git a/include/trace/events/ras.h b/include/trace/events/ras.h deleted file mode 100644 index 1c875ad1ee5f..000000000000 --- a/include/trace/events/ras.h +++ /dev/null | |||
| @@ -1,77 +0,0 @@ | |||
| 1 | #undef TRACE_SYSTEM | ||
| 2 | #define TRACE_SYSTEM ras | ||
| 3 | |||
| 4 | #if !defined(_TRACE_AER_H) || defined(TRACE_HEADER_MULTI_READ) | ||
| 5 | #define _TRACE_AER_H | ||
| 6 | |||
| 7 | #include <linux/tracepoint.h> | ||
| 8 | #include <linux/aer.h> | ||
| 9 | |||
| 10 | |||
| 11 | /* | ||
| 12 | * PCIe AER Trace event | ||
| 13 | * | ||
| 14 | * These events are generated when hardware detects a corrected or | ||
| 15 | * uncorrected event on a PCIe device. The event report has | ||
| 16 | * the following structure: | ||
| 17 | * | ||
| 18 | * char * dev_name - The name of the slot where the device resides | ||
| 19 | * ([domain:]bus:device.function). | ||
| 20 | * u32 status - Either the correctable or uncorrectable register | ||
| 21 | * indicating what error or errors have been seen | ||
| 22 | * u8 severity - error severity 0:NONFATAL 1:FATAL 2:CORRECTED | ||
| 23 | */ | ||
| 24 | |||
| 25 | #define aer_correctable_errors \ | ||
| 26 | {BIT(0), "Receiver Error"}, \ | ||
| 27 | {BIT(6), "Bad TLP"}, \ | ||
| 28 | {BIT(7), "Bad DLLP"}, \ | ||
| 29 | {BIT(8), "RELAY_NUM Rollover"}, \ | ||
| 30 | {BIT(12), "Replay Timer Timeout"}, \ | ||
| 31 | {BIT(13), "Advisory Non-Fatal"} | ||
| 32 | |||
| 33 | #define aer_uncorrectable_errors \ | ||
| 34 | {BIT(4), "Data Link Protocol"}, \ | ||
| 35 | {BIT(12), "Poisoned TLP"}, \ | ||
| 36 | {BIT(13), "Flow Control Protocol"}, \ | ||
| 37 | {BIT(14), "Completion Timeout"}, \ | ||
| 38 | {BIT(15), "Completer Abort"}, \ | ||
| 39 | {BIT(16), "Unexpected Completion"}, \ | ||
| 40 | {BIT(17), "Receiver Overflow"}, \ | ||
| 41 | {BIT(18), "Malformed TLP"}, \ | ||
| 42 | {BIT(19), "ECRC"}, \ | ||
| 43 | {BIT(20), "Unsupported Request"} | ||
| 44 | |||
| 45 | TRACE_EVENT(aer_event, | ||
| 46 | TP_PROTO(const char *dev_name, | ||
| 47 | const u32 status, | ||
| 48 | const u8 severity), | ||
| 49 | |||
| 50 | TP_ARGS(dev_name, status, severity), | ||
| 51 | |||
| 52 | TP_STRUCT__entry( | ||
| 53 | __string( dev_name, dev_name ) | ||
| 54 | __field( u32, status ) | ||
| 55 | __field( u8, severity ) | ||
| 56 | ), | ||
| 57 | |||
| 58 | TP_fast_assign( | ||
| 59 | __assign_str(dev_name, dev_name); | ||
| 60 | __entry->status = status; | ||
| 61 | __entry->severity = severity; | ||
| 62 | ), | ||
| 63 | |||
| 64 | TP_printk("%s PCIe Bus Error: severity=%s, %s\n", | ||
| 65 | __get_str(dev_name), | ||
| 66 | __entry->severity == AER_CORRECTABLE ? "Corrected" : | ||
| 67 | __entry->severity == AER_FATAL ? | ||
| 68 | "Fatal" : "Uncorrected, non-fatal", | ||
| 69 | __entry->severity == AER_CORRECTABLE ? | ||
| 70 | __print_flags(__entry->status, "|", aer_correctable_errors) : | ||
| 71 | __print_flags(__entry->status, "|", aer_uncorrectable_errors)) | ||
| 72 | ); | ||
| 73 | |||
| 74 | #endif /* _TRACE_AER_H */ | ||
| 75 | |||
| 76 | /* This part must be outside protection */ | ||
| 77 | #include <trace/define_trace.h> | ||
