author     Linus Torvalds <torvalds@linux-foundation.org>    2012-03-22 12:42:04 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2012-03-22 12:42:04 -0400
commit     754b9800779402924fffe456b49d557e15260cbf
tree       0e0441eca766616fccd8fc37a3885397efc6063a
parent     35cb8d9e18c0bb33b90d7e574abadbe23b65427d
parent     ea281a9ebaba3287130dbe15bb0aad6f798bb06b
Merge branch 'x86-mce-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull MCE changes from Ingo Molnar.
* 'x86-mce-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mce: Fix return value of mce_chrdev_read() when erst is disabled
x86/mce: Convert static array of pointers to per-cpu variables
x86/mce: Replace hard coded hex constants with symbolic defines
x86/mce: Recognise machine check bank signature for data path error
x86/mce: Handle "action required" errors
x86/mce: Add mechanism to safely save information in MCE handler
x86/mce: Create helper function to save addr/misc when needed
HWPOISON: Add code to handle "action required" errors.
HWPOISON: Clean up memory_failure() vs. __memory_failure()
 arch/x86/include/asm/mce.h                |   2
 arch/x86/kernel/cpu/mcheck/mce-severity.c |  26
 arch/x86/kernel/cpu/mcheck/mce.c          | 193
 arch/x86/kernel/cpu/mcheck/mce_amd.c      |   9
 drivers/base/memory.c                     |   2
 include/linux/mm.h                        |   4
 mm/hwpoison-inject.c                      |   4
 mm/madvise.c                              |   2
 mm/memory-failure.c                       |  96
 9 files changed, 221 insertions(+), 117 deletions(-)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6aefb14cbbc5..441520e4174f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -151,7 +151,7 @@ static inline void enable_p5_mce(void) {}
 
 void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
-extern struct device *mce_device[CONFIG_NR_CPUS];
+DECLARE_PER_CPU(struct device *, mce_device);
 
 /*
  * Maximum banks number.
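
This is the visible half of the "convert static array of pointers to per-cpu variables" change from the pull list. As a rough sketch of the pattern (schematic only, not a runnable unit; the real definition and call sites are in the mce.c hunks further down):

    /* Before: one pointer slot per possible CPU, sized at build time */
    extern struct device *mce_device[CONFIG_NR_CPUS];
    dev = mce_device[cpu];

    /* After: per-cpu storage; declaration in the header, definition in
     * exactly one translation unit, access through per_cpu().
     */
    DECLARE_PER_CPU(struct device *, mce_device);  /* mce.h */
    DEFINE_PER_CPU(struct device *, mce_device);   /* mce.c */
    dev = per_cpu(mce_device, cpu);

Besides dropping the CONFIG_NR_CPUS-sized table, this keeps each CPU's pointer in that CPU's own per-cpu area instead of one shared array.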
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 7395d5f4272d..0c82091b1652 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -54,7 +54,14 @@ static struct severity {
 #define MASK(x, y)      .mask = x, .result = y
 #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
+#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
 #define MCACOD 0xffff
+/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
+#define MCACOD_SCRUB    0x00C0  /* 0xC0-0xCF Memory Scrubbing */
+#define MCACOD_SCRUBMSK 0xfff0
+#define MCACOD_L3WB     0x017A  /* L3 Explicit Writeback */
+#define MCACOD_DATA     0x0134  /* Data Load */
+#define MCACOD_INSTR    0x0150  /* Instruction Fetch */
 
         MCESEV(
                 NO, "Invalid",
@@ -102,11 +109,24 @@ static struct severity {
                 SER, BITCLR(MCI_STATUS_S)
                 ),
 
-        /* AR add known MCACODs here */
         MCESEV(
                 PANIC, "Action required with lost events",
                 SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
                 ),
+
+        /* known AR MCACODs: */
+#ifdef CONFIG_MEMORY_FAILURE
+        MCESEV(
+                KEEP, "HT thread notices Action required: data load error",
+                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+                MCGMASK(MCG_STATUS_EIPV, 0)
+                ),
+        MCESEV(
+                AR, "Action required: data load error",
+                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+                USER
+                ),
+#endif
         MCESEV(
                 PANIC, "Action required: unknown MCACOD",
                 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
@@ -115,11 +135,11 @@ static struct severity {
         /* known AO MCACODs: */
         MCESEV(
                 AO, "Action optional: memory scrubbing error",
-                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
+                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
                 ),
         MCESEV(
                 AO, "Action optional: last level cache writeback error",
-                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
+                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
                 ),
         MCESEV(
                 SOME, "Action optional: unknown MCACOD",
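
Each MCESEV() entry with a MASK(x, y) constraint matches when the masked status equals the expected value, i.e. (status & x) == y. A small user-space illustration of the new AR data-load rule, with the MCi_STATUS bit positions written out from the SDM (the kernel's own MCI_STATUS_* defines live in <asm/mce.h>; this is a sketch of the match logic, not the kernel's table walk):

    #include <stdint.h>
    #include <stdio.h>

    #define BIT64(n)         (1ULL << (n))
    #define MCI_STATUS_OVER  BIT64(62)  /* error overflow */
    #define MCI_STATUS_UC    BIT64(61)  /* uncorrected error */
    #define MCI_STATUS_MISCV BIT64(59)  /* MCi_MISC register valid */
    #define MCI_STATUS_ADDRV BIT64(58)  /* MCi_ADDR register valid */
    #define MCI_STATUS_S     BIT64(56)  /* signaled via MCE */
    #define MCI_STATUS_AR    BIT64(55)  /* action required */

    #define MCI_UC_SAR   (MCI_STATUS_UC | MCI_STATUS_S | MCI_STATUS_AR)
    #define MCI_ADDR     (MCI_STATUS_ADDRV | MCI_STATUS_MISCV)
    #define MCACOD       0xffff   /* low 16 bits: MCA error code */
    #define MCACOD_DATA  0x0134   /* data load */

    int main(void)
    {
            /* a data-load machine check: UC+S+AR set, OVER clear,
             * both auxiliary registers valid, MCACOD = 0x134 */
            uint64_t status = MCI_UC_SAR | MCI_ADDR | MCACOD_DATA;

            uint64_t mask = MCI_STATUS_OVER | MCI_UC_SAR | MCI_ADDR | MCACOD;
            uint64_t want = MCI_UC_SAR | MCI_ADDR | MCACOD_DATA;

            printf("AR data-load rule fires: %d\n", (status & mask) == want);
            return 0;
    }

Note that OVER is part of the mask but not of the expected value: a status with lost events deliberately fails these entries and falls through to the earlier "Action required with lost events" PANIC rule instead.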
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5a11ae2e9e91..c614bd4de0f3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -540,6 +540,27 @@ static void mce_report_event(struct pt_regs *regs)
         irq_work_queue(&__get_cpu_var(mce_irq_work));
 }
 
+/*
+ * Read ADDR and MISC registers.
+ */
+static void mce_read_aux(struct mce *m, int i)
+{
+        if (m->status & MCI_STATUS_MISCV)
+                m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
+        if (m->status & MCI_STATUS_ADDRV) {
+                m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+
+                /*
+                 * Mask the reported address by the reported granularity.
+                 */
+                if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
+                        u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+                        m->addr >>= shift;
+                        m->addr <<= shift;
+                }
+        }
+}
+
 DEFINE_PER_CPU(unsigned, mce_poll_count);
 
 /*
@@ -590,10 +611,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
                     (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
                         continue;
 
-                if (m.status & MCI_STATUS_MISCV)
-                        m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
-                if (m.status & MCI_STATUS_ADDRV)
-                        m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+                mce_read_aux(&m, i);
 
                 if (!(flags & MCP_TIMESTAMP))
                         m.tsc = 0;
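
mce_read_aux() masks the reported address down to the granularity the bank advertises: MCI_MISC_ADDR_LSB(misc) gives the lowest valid bit of MCi_ADDR, so shifting down and back up clears the bits below it. A quick stand-alone illustration, assuming an LSB field of 12 (4K page granularity):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t addr  = 0x12345abc;  /* raw MCi_ADDR value */
            uint8_t  shift = 12;          /* MCI_MISC_ADDR_LSB(misc) */

            addr >>= shift;  /* drop the invalid low bits... */
            addr <<= shift;  /* ...leaving a 4K-aligned address */

            printf("%#llx\n", (unsigned long long)addr);  /* 0x12345000 */
            return 0;
    }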
@@ -917,6 +935,49 @@ static void mce_clear_state(unsigned long *toclear)
 }
 
 /*
+ * Need to save faulting physical address associated with a process
+ * in the machine check handler some place where we can grab it back
+ * later in mce_notify_process()
+ */
+#define MCE_INFO_MAX    16
+
+struct mce_info {
+        atomic_t                inuse;
+        struct task_struct      *t;
+        __u64                   paddr;
+} mce_info[MCE_INFO_MAX];
+
+static void mce_save_info(__u64 addr)
+{
+        struct mce_info *mi;
+
+        for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
+                if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
+                        mi->t = current;
+                        mi->paddr = addr;
+                        return;
+                }
+        }
+
+        mce_panic("Too many concurrent recoverable errors", NULL, NULL);
+}
+
+static struct mce_info *mce_find_info(void)
+{
+        struct mce_info *mi;
+
+        for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
+                if (atomic_read(&mi->inuse) && mi->t == current)
+                        return mi;
+        return NULL;
+}
+
+static void mce_clear_info(struct mce_info *mi)
+{
+        atomic_set(&mi->inuse, 0);
+}
+
+/*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
  *
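
mce_save_info() runs in machine-check context, where taking a lock is not an option, so it claims a slot from a small fixed pool with an atomic 0 -> 1 compare-and-swap; exactly one claimant can win each slot. The same pattern in portable C11 atomics (a sketch of the technique only; the kernel uses atomic_cmpxchg() on an atomic_t):

    #include <stdatomic.h>
    #include <stddef.h>

    #define SLOT_MAX 16

    static struct slot {
            atomic_int inuse;
            unsigned long long paddr;
    } slots[SLOT_MAX];

    /* Claim a free slot locklessly; NULL means the pool is exhausted
     * (the kernel panics in that case: too many concurrent errors). */
    static struct slot *slot_claim(unsigned long long paddr)
    {
            for (struct slot *s = slots; s < &slots[SLOT_MAX]; s++) {
                    int expected = 0;
                    if (atomic_compare_exchange_strong(&s->inuse, &expected, 1)) {
                            s->paddr = paddr;  /* we own the slot now */
                            return s;
                    }
            }
            return NULL;
    }

    static void slot_release(struct slot *s)
    {
            atomic_store(&s->inuse, 0);
    }

mce_find_info() later locates the entry again by matching the stored task pointer against current, so no index needs to be carried across the exception return.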
@@ -969,7 +1030,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
         barrier();
 
         /*
-         * When no restart IP must always kill or panic.
+         * When no restart IP might need to kill or panic.
+         * Assume the worst for now, but if we find the
+         * severity is MCE_AR_SEVERITY we have other options.
          */
         if (!(m.mcgstatus & MCG_STATUS_RIPV))
                 kill_it = 1;
@@ -1023,16 +1086,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
                         continue;
                 }
 
-                /*
-                 * Kill on action required.
-                 */
-                if (severity == MCE_AR_SEVERITY)
-                        kill_it = 1;
-
-                if (m.status & MCI_STATUS_MISCV)
-                        m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
-                if (m.status & MCI_STATUS_ADDRV)
-                        m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+                mce_read_aux(&m, i);
 
                 /*
                  * Action optional error. Queue address for later processing.
@@ -1052,6 +1106,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
                 }
         }
 
+        /* mce_clear_state will clear *final, save locally for use later */
+        m = *final;
+
         if (!no_way_out)
                 mce_clear_state(toclear);
 
@@ -1063,27 +1120,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
                 no_way_out = worst >= MCE_PANIC_SEVERITY;
 
         /*
-         * If we have decided that we just CAN'T continue, and the user
-         * has not set tolerant to an insane level, give up and die.
-         *
-         * This is mainly used in the case when the system doesn't
-         * support MCE broadcasting or it has been disabled.
-         */
-        if (no_way_out && tolerant < 3)
-                mce_panic("Fatal machine check on current CPU", final, msg);
-
-        /*
-         * If the error seems to be unrecoverable, something should be
-         * done. Try to kill as little as possible. If we can kill just
-         * one task, do that. If the user has set the tolerance very
-         * high, don't try to do anything at all.
+         * At insane "tolerant" levels we take no action. Otherwise
+         * we only die if we have no other choice. For less serious
+         * issues we try to recover, or limit damage to the current
+         * process.
          */
-
-        if (kill_it && tolerant < 3)
-                force_sig(SIGBUS, current);
-
-        /* notify userspace ASAP */
-        set_thread_flag(TIF_MCE_NOTIFY);
+        if (tolerant < 3) {
+                if (no_way_out)
+                        mce_panic("Fatal machine check on current CPU", &m, msg);
+                if (worst == MCE_AR_SEVERITY) {
+                        /* schedule action before return to userland */
+                        mce_save_info(m.addr);
+                        set_thread_flag(TIF_MCE_NOTIFY);
+                } else if (kill_it) {
+                        force_sig(SIGBUS, current);
+                }
+        }
 
         if (worst > 0)
                 mce_report_event(regs);
@@ -1094,34 +1146,57 @@ out:
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
-/* dummy to break dependency. actual code is in mm/memory-failure.c */
-void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+#ifndef CONFIG_MEMORY_FAILURE
+int memory_failure(unsigned long pfn, int vector, int flags)
 {
-        printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+        /* mce_severity() should not hand us an ACTION_REQUIRED error */
+        BUG_ON(flags & MF_ACTION_REQUIRED);
+        printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
+               "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
+
+        return 0;
 }
+#endif
 
 /*
- * Called after mce notification in process context. This code
- * is allowed to sleep. Call the high level VM handler to process
- * any corrupted pages.
- * Assume that the work queue code only calls this one at a time
- * per CPU.
- * Note we don't disable preemption, so this code might run on the wrong
- * CPU. In this case the event is picked up by the scheduled work queue.
- * This is merely a fast path to expedite processing in some common
- * cases.
+ * Called in process context that interrupted by MCE and marked with
+ * TIF_MCE_NOTIFY, just before returning to erroneous userland.
+ * This code is allowed to sleep.
+ * Attempt possible recovery such as calling the high level VM handler to
+ * process any corrupted pages, and kill/signal current process if required.
+ * Action required errors are handled here.
  */
 void mce_notify_process(void)
 {
         unsigned long pfn;
-        mce_notify_irq();
-        while (mce_ring_get(&pfn))
-                memory_failure(pfn, MCE_VECTOR);
+        struct mce_info *mi = mce_find_info();
+
+        if (!mi)
+                mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
+        pfn = mi->paddr >> PAGE_SHIFT;
+
+        clear_thread_flag(TIF_MCE_NOTIFY);
+
+        pr_err("Uncorrected hardware memory error in user-access at %llx",
+               mi->paddr);
+        if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) {
+                pr_err("Memory error not recovered");
+                force_sig(SIGBUS, current);
+        }
+        mce_clear_info(mi);
 }
 
+/*
+ * Action optional processing happens here (picking up
+ * from the list of faulting pages that do_machine_check()
+ * placed into the "ring").
+ */
 static void mce_process_work(struct work_struct *dummy)
 {
-        mce_notify_process();
+        unsigned long pfn;
+
+        while (mce_ring_get(&pfn))
+                memory_failure(pfn, MCE_VECTOR, 0);
 }
 
 #ifdef CONFIG_X86_MCE_INTEL
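
The division of labor after this change: action-required errors are pinned to the faulting task (mce_save_info() plus TIF_MCE_NOTIFY) and handled synchronously in mce_notify_process() before that task can touch user memory again, while action-optional pfns still go through the ring and are drained from the work queue. The TIF flag is consumed on the return-to-user path, roughly like this (abridged from arch/x86/kernel/signal.c of this era):

    /* in do_notify_resume(), just before returning to user space */
    #ifdef CONFIG_X86_MCE
            /* notify userspace of pending MCEs */
            if (thread_info_flags & _TIF_MCE_NOTIFY)
                    mce_notify_process();
    #endif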
@@ -1211,8 +1286,6 @@ int mce_notify_irq(void)
         /* Not more than two messages every minute */
         static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
 
-        clear_thread_flag(TIF_MCE_NOTIFY);
-
         if (test_and_clear_bit(0, &mce_need_notify)) {
                 /* wake processes polling /dev/mcelog */
                 wake_up_interruptible(&mce_chrdev_wait);
@@ -1541,6 +1614,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
         /* Error or no more MCE record */
         if (rc <= 0) {
                 mce_apei_read_done = 1;
+                /*
+                 * When ERST is disabled, mce_chrdev_read() should return
+                 * "no record" instead of "no device."
+                 */
+                if (rc == -ENODEV)
+                        return 0;
                 return rc;
         }
         rc = -EFAULT;
@@ -1859,7 +1938,7 @@ static struct bus_type mce_subsys = {
         .dev_name       = "machinecheck",
 };
 
-struct device *mce_device[CONFIG_NR_CPUS];
+DEFINE_PER_CPU(struct device *, mce_device);
 
 __cpuinitdata
 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -2038,7 +2117,7 @@ static __cpuinit int mce_device_create(unsigned int cpu)
                         goto error2;
         }
         cpumask_set_cpu(cpu, mce_device_initialized);
-        mce_device[cpu] = dev;
+        per_cpu(mce_device, cpu) = dev;
 
         return 0;
 error2:
@@ -2055,7 +2134,7 @@ error:
 
 static __cpuinit void mce_device_remove(unsigned int cpu)
 {
-        struct device *dev = mce_device[cpu];
+        struct device *dev = per_cpu(mce_device, cpu);
         int i;
 
         if (!cpumask_test_cpu(cpu, mce_device_initialized))
@@ -2069,7 +2148,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu)
 
         device_unregister(dev);
         cpumask_clear_cpu(cpu, mce_device_initialized);
-        mce_device[cpu] = NULL;
+        per_cpu(mce_device, cpu) = NULL;
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index e4eeaaf58a47..99b57179f912 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -523,7 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 {
         int i, err = 0;
         struct threshold_bank *b = NULL;
-        struct device *dev = mce_device[cpu];
+        struct device *dev = per_cpu(mce_device, cpu);
         char name[32];
 
         sprintf(name, "threshold_bank%i", bank);
@@ -587,7 +587,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
                 if (i == cpu)
                         continue;
 
-                dev = mce_device[i];
+                dev = per_cpu(mce_device, i);
                 if (dev)
                         err = sysfs_create_link(&dev->kobj,b->kobj, name);
                 if (err)
@@ -667,7 +667,8 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #ifdef CONFIG_SMP
         /* sibling symlink */
         if (shared_bank[bank] && b->blocks->cpu != cpu) {
-                sysfs_remove_link(&mce_device[cpu]->kobj, name);
+                dev = per_cpu(mce_device, cpu);
+                sysfs_remove_link(&dev->kobj, name);
                 per_cpu(threshold_banks, cpu)[bank] = NULL;
 
                 return;
@@ -679,7 +680,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
                 if (i == cpu)
                         continue;
 
-                dev = mce_device[i];
+                dev = per_cpu(mce_device, i);
                 if (dev)
                         sysfs_remove_link(&dev->kobj, name);
                 per_cpu(threshold_banks, i)[bank] = NULL;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 9e60dbe9fd94..7dda4f790f00 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -466,7 +466,7 @@ store_hard_offline_page(struct device *dev,
         if (strict_strtoull(buf, 0, &pfn) < 0)
                 return -EINVAL;
         pfn >>= PAGE_SHIFT;
-        ret = __memory_failure(pfn, 0, 0);
+        ret = memory_failure(pfn, 0, 0);
         return ret ? ret : count;
 }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ee67e326b6f8..7330742e7973 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1600,9 +1600,9 @@ void vmemmap_populate_print_last(void);
 
 enum mf_flags {
         MF_COUNT_INCREASED = 1 << 0,
+        MF_ACTION_REQUIRED = 1 << 1,
 };
-extern void memory_failure(unsigned long pfn, int trapno);
-extern int __memory_failure(unsigned long pfn, int trapno, int flags);
+extern int memory_failure(unsigned long pfn, int trapno, int flags);
 extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
 extern int unpoison_memory(unsigned long pfn);
 extern int sysctl_memory_failure_early_kill;
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index c7fc7fd00e32..cc448bb983ba 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -45,7 +45,7 @@ static int hwpoison_inject(void *data, u64 val)
          * do a racy check with elevated page count, to make sure PG_hwpoison
          * will only be set for the targeted owner (or on a free page).
          * We temporarily take page lock for try_get_mem_cgroup_from_page().
-         * __memory_failure() will redo the check reliably inside page lock.
+         * memory_failure() will redo the check reliably inside page lock.
          */
         lock_page(hpage);
         err = hwpoison_filter(hpage);
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val)
 
 inject:
         printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
-        return __memory_failure(pfn, 18, MF_COUNT_INCREASED);
+        return memory_failure(pfn, 18, MF_COUNT_INCREASED);
 }
 
 static int hwpoison_unpoison(void *data, u64 val)
diff --git a/mm/madvise.c b/mm/madvise.c
index 74bf193eff04..f5ab745672b7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -251,7 +251,7 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
                 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
                        page_to_pfn(p), start);
                 /* Ignore return value for now */
-                __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+                memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
         }
         return ret;
 }
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c22076ffdd44..97cc2733551a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -187,33 +187,40 @@ int hwpoison_filter(struct page *p)
 EXPORT_SYMBOL_GPL(hwpoison_filter);
 
 /*
- * Send all the processes who have the page mapped an ``action optional''
- * signal.
+ * Send all the processes who have the page mapped a signal.
+ * ``action optional'' if they are not immediately affected by the error
+ * ``action required'' if error happened in current execution context
  */
-static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
-                        unsigned long pfn, struct page *page)
+static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
+                        unsigned long pfn, struct page *page, int flags)
 {
         struct siginfo si;
         int ret;
 
         printk(KERN_ERR
-               "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
+                "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
                pfn, t->comm, t->pid);
         si.si_signo = SIGBUS;
         si.si_errno = 0;
-        si.si_code = BUS_MCEERR_AO;
         si.si_addr = (void *)addr;
 #ifdef __ARCH_SI_TRAPNO
         si.si_trapno = trapno;
 #endif
         si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
-        /*
-         * Don't use force here, it's convenient if the signal
-         * can be temporarily blocked.
-         * This could cause a loop when the user sets SIGBUS
-         * to SIG_IGN, but hopefully no one will do that?
-         */
-        ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */
+
+        if ((flags & MF_ACTION_REQUIRED) && t == current) {
+                si.si_code = BUS_MCEERR_AR;
+                ret = force_sig_info(SIGBUS, &si, t);
+        } else {
+                /*
+                 * Don't use force here, it's convenient if the signal
+                 * can be temporarily blocked.
+                 * This could cause a loop when the user sets SIGBUS
+                 * to SIG_IGN, but hopefully no one will do that?
+                 */
+                si.si_code = BUS_MCEERR_AO;
+                ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */
+        }
         if (ret < 0)
                 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
                        t->comm, t->pid, ret);
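
From user space the two cases are distinguishable through siginfo: si_code is BUS_MCEERR_AR for the forced, synchronous signal and BUS_MCEERR_AO for the advisory one, with si_addr_lsb giving the granularity of the reported address. A minimal handler sketch (assumes a glibc that exposes these constants and fields; the fprintf calls are demonstration only and not async-signal-safe):

    #define _GNU_SOURCE
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
    {
            (void)sig; (void)ctx;
            if (si->si_code == BUS_MCEERR_AR)
                    /* our own access hit poisoned memory: must not retry it */
                    fprintf(stderr, "AR fault at %p (lsb %d)\n",
                            si->si_addr, si->si_addr_lsb);
            else if (si->si_code == BUS_MCEERR_AO)
                    /* advisory: a mapped page went bad; discard or remap it */
                    fprintf(stderr, "AO notice at %p\n", si->si_addr);
    }

    int main(void)
    {
            struct sigaction sa = {
                    .sa_sigaction = sigbus_handler,
                    .sa_flags     = SA_SIGINFO,
            };
            sigaction(SIGBUS, &sa, NULL);
            pause();  /* wait for a signal (demo only) */
            return 0;
    }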
@@ -338,8 +345,9 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
  * Also when FAIL is set do a force kill because something went
  * wrong earlier.
  */
-static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
-                          int fail, struct page *page, unsigned long pfn)
+static void kill_procs(struct list_head *to_kill, int doit, int trapno,
+                      int fail, struct page *page, unsigned long pfn,
+                      int flags)
 {
         struct to_kill *tk, *next;
 
@@ -363,8 +371,8 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
                          * check for that, but we need to tell the
                          * process anyways.
                          */
-                        else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
-                                              pfn, page) < 0)
+                        else if (kill_proc(tk->tsk, tk->addr, trapno,
+                                      pfn, page, flags) < 0)
                                 printk(KERN_ERR
                 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
                                 pfn, tk->tsk->comm, tk->tsk->pid);
@@ -844,7 +852,7 @@ static int page_action(struct page_state *ps, struct page *p,
  * the pages and send SIGBUS to the processes if the data was dirty.
  */
 static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
-                                  int trapno)
+                                  int trapno, int flags)
 {
         enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
         struct address_space *mapping;
@@ -962,8 +970,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
          * use a more force-full uncatchable kill to prevent
          * any accesses to the poisoned memory.
          */
-        kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
-                      ret != SWAP_SUCCESS, p, pfn);
+        kill_procs(&tokill, !!PageDirty(ppage), trapno,
+                      ret != SWAP_SUCCESS, p, pfn, flags);
 
         return ret;
 }
@@ -984,7 +992,25 @@ static void clear_page_hwpoison_huge_page(struct page *hpage)
                 ClearPageHWPoison(hpage + i);
 }
 
-int __memory_failure(unsigned long pfn, int trapno, int flags)
+/**
+ * memory_failure - Handle memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @trapno: Trap number reported in the signal to user space.
+ * @flags: fine tune action taken
+ *
+ * This function is called by the low level machine check code
+ * of an architecture when it detects hardware memory corruption
+ * of a page. It tries its best to recover, which includes
+ * dropping pages, killing processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Must run in process context (e.g. a work queue) with interrupts
+ * enabled and no spinlocks hold.
+ */
+int memory_failure(unsigned long pfn, int trapno, int flags)
 {
         struct page_state *ps;
         struct page *p;
@@ -1130,7 +1156,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
          * Now take care of user space mappings.
          * Abort on fail: __delete_from_page_cache() assumes unmapped page.
          */
-        if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
+        if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
                 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
                 res = -EBUSY;
                 goto out;
@@ -1156,29 +1182,7 @@ out:
         unlock_page(hpage);
         return res;
 }
-EXPORT_SYMBOL_GPL(__memory_failure);
-
-/**
- * memory_failure - Handle memory failure of a page.
- * @pfn: Page Number of the corrupted page
- * @trapno: Trap number reported in the signal to user space.
- *
- * This function is called by the low level machine check code
- * of an architecture when it detects hardware memory corruption
- * of a page. It tries its best to recover, which includes
- * dropping pages, killing processes etc.
- *
- * The function is primarily of use for corruptions that
- * happen outside the current execution context (e.g. when
- * detected by a background scrubber)
- *
- * Must run in process context (e.g. a work queue) with interrupts
- * enabled and no spinlocks hold.
- */
-void memory_failure(unsigned long pfn, int trapno)
-{
-        __memory_failure(pfn, trapno, 0);
-}
+EXPORT_SYMBOL_GPL(memory_failure);
 
 #define MEMORY_FAILURE_FIFO_ORDER       4
 #define MEMORY_FAILURE_FIFO_SIZE        (1 << MEMORY_FAILURE_FIFO_ORDER)
@@ -1251,7 +1255,7 @@ static void memory_failure_work_func(struct work_struct *work)
                 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
                 if (!gotten)
                         break;
-                __memory_failure(entry.pfn, entry.trapno, entry.flags);
+                memory_failure(entry.pfn, entry.trapno, entry.flags);
         }
 }
 