Diffstat (limited to 'arch/x86/kernel/cpu/mcheck/mce.c')
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 179 ++++++++++++++++++--------
1 file changed, 126 insertions(+), 53 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5a11ae2e9e91..ad573d8baf10 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -540,6 +540,27 @@ static void mce_report_event(struct pt_regs *regs)
 	irq_work_queue(&__get_cpu_var(mce_irq_work));
 }
 
+/*
+ * Read ADDR and MISC registers.
+ */
+static void mce_read_aux(struct mce *m, int i)
+{
+	if (m->status & MCI_STATUS_MISCV)
+		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
+	if (m->status & MCI_STATUS_ADDRV) {
+		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+
+		/*
+		 * Mask the reported address by the reported granularity.
+		 */
+		if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
+			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+			m->addr >>= shift;
+			m->addr <<= shift;
+		}
+	}
+}
+
 DEFINE_PER_CPU(unsigned, mce_poll_count);
 
 /*
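
The masking step in mce_read_aux() clips the recorded address to the granularity the bank reports: MCI_MISC_ADDR_LSB() extracts the position of the least significant valid address bit from MCi_MISC (bits 5:0 in the architectural layout), and the shift pair zeroes everything below it. A standalone sketch of that arithmetic, with made-up register values:

    #include <stdint.h>
    #include <stdio.h>

    /* Same extraction the kernel macro performs: bits 5:0 of MCi_MISC
     * give the least significant valid bit of the recorded address. */
    #define MISC_ADDR_LSB(misc)  ((uint8_t)((misc) & 0x3f))

    int main(void)
    {
        uint64_t addr = 0x123456789abcULL; /* hypothetical MCi_ADDR */
        uint64_t misc = 0x0c;              /* hypothetical MCi_MISC: lsb = 12 (4K) */
        uint8_t shift = MISC_ADDR_LSB(misc);

        addr >>= shift;                    /* drop the bits the bank marks invalid */
        addr <<= shift;                    /* -> 0x123456789000 */
        printf("masked address: %#llx\n", (unsigned long long)addr);
        return 0;
    }
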
@@ -590,10 +611,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 			continue;
 
-		if (m.status & MCI_STATUS_MISCV)
-			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
-		if (m.status & MCI_STATUS_ADDRV)
-			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+		mce_read_aux(&m, i);
 
 		if (!(flags & MCP_TIMESTAMP))
 			m.tsc = 0;
@@ -917,6 +935,49 @@ static void mce_clear_state(unsigned long *toclear)
 }
 
 /*
+ * Need to save faulting physical address associated with a process
+ * in the machine check handler some place where we can grab it back
+ * later in mce_notify_process()
+ */
+#define MCE_INFO_MAX	16
+
+struct mce_info {
+	atomic_t		inuse;
+	struct task_struct	*t;
+	__u64			paddr;
+} mce_info[MCE_INFO_MAX];
+
+static void mce_save_info(__u64 addr)
+{
+	struct mce_info *mi;
+
+	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
+		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
+			mi->t = current;
+			mi->paddr = addr;
+			return;
+		}
+	}
+
+	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
+}
+
+static struct mce_info *mce_find_info(void)
+{
+	struct mce_info *mi;
+
+	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
+		if (atomic_read(&mi->inuse) && mi->t == current)
+			return mi;
+	return NULL;
+}
+
+static void mce_clear_info(struct mce_info *mi)
+{
+	atomic_set(&mi->inuse, 0);
+}
+
+/*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
  *
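
mce_save_info() claims one of the sixteen mce_info slots with atomic_cmpxchg(), so concurrent machine checks can each stash a (task, address) pair without locks: only the CPU whose compare-and-exchange flips inuse from 0 to 1 owns a slot. A minimal user-space sketch of the same claim/release pattern using C11 atomics (the names here are illustrative, not kernel API):

    #include <stdatomic.h>
    #include <stdio.h>

    #define SLOT_MAX 16

    struct slot {
        atomic_int inuse;
        unsigned long long paddr;
    };

    static struct slot slots[SLOT_MAX];

    /* Claim a free slot: compare-and-exchange guarantees at most one
     * winner per slot even when several threads race through the loop. */
    static struct slot *claim_slot(unsigned long long paddr)
    {
        for (struct slot *s = slots; s < &slots[SLOT_MAX]; s++) {
            int expected = 0;  /* reset each try: CAS rewrites it on failure */
            if (atomic_compare_exchange_strong(&s->inuse, &expected, 1)) {
                s->paddr = paddr;
                return s;
            }
        }
        return NULL;           /* the kernel panics here instead */
    }

    static void release_slot(struct slot *s)  /* mce_clear_info() analogue */
    {
        atomic_store(&s->inuse, 0);
    }

    int main(void)
    {
        struct slot *s = claim_slot(0x123000);
        if (s) {
            printf("claimed slot %td\n", s - slots);
            release_slot(s);
        }
        return 0;
    }
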
@@ -969,7 +1030,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	barrier();
 
 	/*
-	 * When no restart IP must always kill or panic.
+	 * When no restart IP might need to kill or panic.
+	 * Assume the worst for now, but if we find the
+	 * severity is MCE_AR_SEVERITY we have other options.
 	 */
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
 		kill_it = 1;
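
MCG_STATUS.RIPV ("restart IP valid", bit 0) tells the handler whether the return address on the exception frame can safely be resumed; with it clear the patch still starts from the pessimistic kill_it = 1 and only relaxes that later if the worst severity turns out to be MCE_AR_SEVERITY. A tiny illustration of the bit test (bit positions per the architectural MCG_STATUS layout):

    #include <stdint.h>
    #include <stdio.h>

    #define MCG_STATUS_RIPV  (1ULL << 0)  /* restart IP valid */
    #define MCG_STATUS_EIPV  (1ULL << 1)  /* error IP valid */

    int main(void)
    {
        uint64_t mcgstatus = MCG_STATUS_EIPV;  /* example: RIPV clear */

        /* Mirrors the hunk above: no valid restart IP means the
         * interrupted context cannot simply be resumed. */
        int kill_it = !(mcgstatus & MCG_STATUS_RIPV);
        printf("kill_it = %d\n", kill_it);
        return 0;
    }
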
@@ -1023,16 +1086,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 			continue;
 		}
 
-		/*
-		 * Kill on action required.
-		 */
-		if (severity == MCE_AR_SEVERITY)
-			kill_it = 1;
-
-		if (m.status & MCI_STATUS_MISCV)
-			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
-		if (m.status & MCI_STATUS_ADDRV)
-			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+		mce_read_aux(&m, i);
 
 		/*
 		 * Action optional error. Queue address for later processing.
@@ -1052,6 +1106,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		}
 	}
 
+	/* mce_clear_state will clear *final, save locally for use later */
+	m = *final;
+
 	if (!no_way_out)
 		mce_clear_state(toclear);
 
@@ -1063,27 +1120,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	no_way_out = worst >= MCE_PANIC_SEVERITY;
 
 	/*
-	 * If we have decided that we just CAN'T continue, and the user
-	 * has not set tolerant to an insane level, give up and die.
-	 *
-	 * This is mainly used in the case when the system doesn't
-	 * support MCE broadcasting or it has been disabled.
-	 */
-	if (no_way_out && tolerant < 3)
-		mce_panic("Fatal machine check on current CPU", final, msg);
-
-	/*
-	 * If the error seems to be unrecoverable, something should be
-	 * done. Try to kill as little as possible. If we can kill just
-	 * one task, do that. If the user has set the tolerance very
-	 * high, don't try to do anything at all.
+	 * At insane "tolerant" levels we take no action. Otherwise
+	 * we only die if we have no other choice. For less serious
+	 * issues we try to recover, or limit damage to the current
+	 * process.
 	 */
-
-	if (kill_it && tolerant < 3)
-		force_sig(SIGBUS, current);
-
-	/* notify userspace ASAP */
-	set_thread_flag(TIF_MCE_NOTIFY);
+	if (tolerant < 3) {
+		if (no_way_out)
+			mce_panic("Fatal machine check on current CPU", &m, msg);
+		if (worst == MCE_AR_SEVERITY) {
+			/* schedule action before return to userland */
+			mce_save_info(m.addr);
+			set_thread_flag(TIF_MCE_NOTIFY);
+		} else if (kill_it) {
+			force_sig(SIGBUS, current);
+		}
+	}
 
 	if (worst > 0)
 		mce_report_event(regs);
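
The rewritten block turns two independent checks into one ordered policy under tolerant < 3: panic when there is no way out, defer recovery to mce_notify_process() for action-required errors, and fall back to SIGBUS otherwise. A condensed restatement of that ordering (severity names other than MCE_AR_SEVERITY/MCE_PANIC_SEVERITY are placeholders, and the kernel's real severity ladder has more rungs):

    #include <stdbool.h>
    #include <stdio.h>

    enum severity { SEV_NONE, SEV_SOME, SEV_AR, SEV_PANIC };

    static const char *decide(int tolerant, enum severity worst, bool kill_it)
    {
        if (tolerant >= 3)
            return "log only";      /* "insane" tolerance: take no action */
        if (worst >= SEV_PANIC)
            return "mce_panic()";   /* no_way_out */
        if (worst == SEV_AR)
            return "mce_save_info() + TIF_MCE_NOTIFY";
        if (kill_it)
            return "force_sig(SIGBUS)";
        return "continue";
    }

    int main(void)
    {
        printf("%s\n", decide(1, SEV_AR, true));  /* -> deferred AR recovery */
        return 0;
    }
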
@@ -1094,34 +1146,57 @@ out:
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
-/* dummy to break dependency. actual code is in mm/memory-failure.c */
-void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+#ifndef CONFIG_MEMORY_FAILURE
+int memory_failure(unsigned long pfn, int vector, int flags)
 {
-	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+	/* mce_severity() should not hand us an ACTION_REQUIRED error */
+	BUG_ON(flags & MF_ACTION_REQUIRED);
+	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
+	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
+
+	return 0;
 }
+#endif
 
 /*
- * Called after mce notification in process context. This code
- * is allowed to sleep. Call the high level VM handler to process
- * any corrupted pages.
- * Assume that the work queue code only calls this one at a time
- * per CPU.
- * Note we don't disable preemption, so this code might run on the wrong
- * CPU. In this case the event is picked up by the scheduled work queue.
- * This is merely a fast path to expedite processing in some common
- * cases.
+ * Called in process context that interrupted by MCE and marked with
+ * TIF_MCE_NOTIFY, just before returning to erroneous userland.
+ * This code is allowed to sleep.
+ * Attempt possible recovery such as calling the high level VM handler to
+ * process any corrupted pages, and kill/signal current process if required.
+ * Action required errors are handled here.
 */
 void mce_notify_process(void)
 {
 	unsigned long pfn;
-	mce_notify_irq();
-	while (mce_ring_get(&pfn))
-		memory_failure(pfn, MCE_VECTOR);
+	struct mce_info *mi = mce_find_info();
+
+	if (!mi)
+		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
+	pfn = mi->paddr >> PAGE_SHIFT;
+
+	clear_thread_flag(TIF_MCE_NOTIFY);
+
+	pr_err("Uncorrected hardware memory error in user-access at %llx",
+		 mi->paddr);
+	if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) {
+		pr_err("Memory error not recovered");
+		force_sig(SIGBUS, current);
+	}
+	mce_clear_info(mi);
 }
 
+/*
+ * Action optional processing happens here (picking up
+ * from the list of faulting pages that do_machine_check()
+ * placed into the "ring").
+ */
 static void mce_process_work(struct work_struct *dummy)
 {
-	mce_notify_process();
+	unsigned long pfn;
+
+	while (mce_ring_get(&pfn))
+		memory_failure(pfn, MCE_VECTOR, 0);
 }
 
 #ifdef CONFIG_X86_MCE_INTEL
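
After this patch memory_failure() is reached on two distinct paths: synchronously from mce_notify_process() with MF_ACTION_REQUIRED for the faulting task itself, and from the mce_process_work() work queue with flags 0 for action-optional errors pulled off the ring. A stub sketch contrasting the two call flavors (memory_failure_stub() and the flag value are invented stand-ins for the mm/memory-failure.c entry point; MCE_VECTOR is 18, the #MC exception vector):

    #include <stdio.h>

    #define MCE_VECTOR          18    /* #MC exception vector */
    #define MF_ACTION_REQUIRED  0x1   /* illustrative value; the real flag lives in <linux/mm.h> */

    static int memory_failure_stub(unsigned long pfn, int vector, int flags)
    {
        printf("pfn=%#lx vector=%d: %s\n", pfn, vector,
               flags & MF_ACTION_REQUIRED ?
               "AR: handle now, in the context of the faulting task" :
               "AO: handled later, from the work queue");
        return 0;
    }

    int main(void)
    {
        memory_failure_stub(0x123, MCE_VECTOR, MF_ACTION_REQUIRED); /* mce_notify_process() path */
        memory_failure_stub(0x456, MCE_VECTOR, 0);                  /* mce_process_work() path */
        return 0;
    }
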
@@ -1211,8 +1286,6 @@ int mce_notify_irq(void)
 	/* Not more than two messages every minute */
 	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
 
-	clear_thread_flag(TIF_MCE_NOTIFY);
-
 	if (test_and_clear_bit(0, &mce_need_notify)) {
 		/* wake processes polling /dev/mcelog */
 		wake_up_interruptible(&mce_chrdev_wait);