diff options
author | Robin Getz <robin.getz@analog.com> | 2009-05-18 14:33:26 -0400 |
---|---|---|
committer | Mike Frysinger <vapier@gentoo.org> | 2009-06-12 06:11:44 -0400 |
commit | b9a3899d59c3f0fc074573f0eba2419b1e4c0bca (patch) | |
tree | 09d20c3b062604d615c9d224c06c5df5a5acbcec | |
parent | 97b070c8e7e82be30c8a3bf19e69b8c0c71f1fac (diff) |
Blackfin: make deferred hardware errors more exact
Hardware errors on the Blackfin architecture are queued by nature of the
hardware design. Things that could generate a hardware error queue up at
the system interface and might not be processed until much later, at which
point the system would send a notification back to the core.
As such, it is possible for user space code to do something that would
trigger a hardware error, but have it delay long enough for the process
context to switch. So when the hardware error does signal, we mistakenly
evaluate it as a different process or as kernel context and panic (erp!).
This makes it pretty difficult to find the offending context. But wait,
there is good news somewhere.
By forcing a SSYNC in the interrupt entry, we force all pending queues at
the system level to be processed and all hardware errors to be signaled.
Then we check the current interrupt state to see if the hardware error is
now signaled. If so, we re-queue the current interrupt and return thus
allowing the higher priority hardware error interrupt to process properly.
Since we haven't done any other context processing yet, the right context
will be selected and killed. There is still the possibility that the
exact offending instruction will be unknown, but at least we'll have a
much better idea of where to look.
The downside of course is that this causes system-wide syncs at every
interrupt point which results in significant performance degradation.
Since this situation should not occur in any properly configured system
(as hardware errors are triggered by things like bad pointers), make it a
debug configuration option and disable it by default.
Signed-off-by: Robin Getz <robin.getz@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
-rw-r--r-- | arch/blackfin/Kconfig.debug | 13 | ||||
-rw-r--r-- | arch/blackfin/include/asm/entry.h | 92 | ||||
-rw-r--r-- | arch/blackfin/mach-common/entry.S | 59 | ||||
-rw-r--r-- | arch/blackfin/mach-common/interrupt.S | 27 |
4 files changed, 170 insertions, 21 deletions
diff --git a/arch/blackfin/Kconfig.debug b/arch/blackfin/Kconfig.debug index 79e7e63ab709..1fc4981d486f 100644 --- a/arch/blackfin/Kconfig.debug +++ b/arch/blackfin/Kconfig.debug | |||
@@ -54,6 +54,19 @@ config DEBUG_HWERR | |||
54 | hardware error interrupts and need to know where they are coming | 54 | hardware error interrupts and need to know where they are coming |
55 | from. | 55 | from. |
56 | 56 | ||
57 | config EXACT_HWERR | ||
58 | bool "Try to make Hardware errors exact" | ||
59 | depends on DEBUG_HWERR | ||
60 | help | ||
61 | By default, the Blackfin hardware errors are not exact - the error | ||
62 | may be reported multiple cycles after the error happens. This delay | ||
63 | can cause the wrong application, or even the kernel to receive a | ||
64 | signal to be killed. If you are getting HW errors in your system, | ||
65 | try turning this on to ensure they are at least coming from the | ||
66 | proper thread. | ||
67 | |||
68 | On production systems, it is safe (and a small optimization) to say N. | ||
69 | |||
57 | config DEBUG_DOUBLEFAULT | 70 | config DEBUG_DOUBLEFAULT |
58 | bool "Debug Double Faults" | 71 | bool "Debug Double Faults" |
59 | default n | 72 | default n |
diff --git a/arch/blackfin/include/asm/entry.h b/arch/blackfin/include/asm/entry.h index b30a2968e274..ec58efc130e6 100644 --- a/arch/blackfin/include/asm/entry.h +++ b/arch/blackfin/include/asm/entry.h | |||
@@ -35,21 +35,39 @@ | |||
35 | #else | 35 | #else |
36 | # define LOAD_IPIPE_IPEND | 36 | # define LOAD_IPIPE_IPEND |
37 | #endif | 37 | #endif |
38 | |||
39 | #ifndef CONFIG_EXACT_HWERR | ||
40 | /* As a debugging aid - we save IPEND when DEBUG_KERNEL is on, | ||
41 | * otherwise it is a waste of cycles. | ||
42 | */ | ||
43 | # ifndef CONFIG_DEBUG_KERNEL | ||
44 | #define INTERRUPT_ENTRY(N) \ | ||
45 | [--sp] = SYSCFG; \ | ||
46 | [--sp] = P0; /*orig_p0*/ \ | ||
47 | [--sp] = R0; /*orig_r0*/ \ | ||
48 | [--sp] = (R7:0,P5:0); \ | ||
49 | R0 = (N); \ | ||
50 | LOAD_IPIPE_IPEND \ | ||
51 | jump __common_int_entry; | ||
52 | # else /* CONFIG_DEBUG_KERNEL */ | ||
38 | #define INTERRUPT_ENTRY(N) \ | 53 | #define INTERRUPT_ENTRY(N) \ |
39 | [--sp] = SYSCFG; \ | 54 | [--sp] = SYSCFG; \ |
40 | \ | ||
41 | [--sp] = P0; /*orig_p0*/ \ | 55 | [--sp] = P0; /*orig_p0*/ \ |
42 | [--sp] = R0; /*orig_r0*/ \ | 56 | [--sp] = R0; /*orig_r0*/ \ |
43 | [--sp] = (R7:0,P5:0); \ | 57 | [--sp] = (R7:0,P5:0); \ |
58 | p0.l = lo(IPEND); \ | ||
59 | p0.h = hi(IPEND); \ | ||
60 | r1 = [p0]; \ | ||
44 | R0 = (N); \ | 61 | R0 = (N); \ |
45 | LOAD_IPIPE_IPEND \ | 62 | LOAD_IPIPE_IPEND \ |
46 | jump __common_int_entry; | 63 | jump __common_int_entry; |
64 | # endif /* CONFIG_DEBUG_KERNEL */ | ||
47 | 65 | ||
48 | /* For timer interrupts, we need to save IPEND, since the user_mode | 66 | /* For timer interrupts, we need to save IPEND, since the user_mode |
49 | macro accesses it to determine where to account time. */ | 67 | *macro accesses it to determine where to account time. |
68 | */ | ||
50 | #define TIMER_INTERRUPT_ENTRY(N) \ | 69 | #define TIMER_INTERRUPT_ENTRY(N) \ |
51 | [--sp] = SYSCFG; \ | 70 | [--sp] = SYSCFG; \ |
52 | \ | ||
53 | [--sp] = P0; /*orig_p0*/ \ | 71 | [--sp] = P0; /*orig_p0*/ \ |
54 | [--sp] = R0; /*orig_r0*/ \ | 72 | [--sp] = R0; /*orig_r0*/ \ |
55 | [--sp] = (R7:0,P5:0); \ | 73 | [--sp] = (R7:0,P5:0); \ |
@@ -58,6 +76,74 @@ | |||
58 | r1 = [p0]; \ | 76 | r1 = [p0]; \ |
59 | R0 = (N); \ | 77 | R0 = (N); \ |
60 | jump __common_int_entry; | 78 | jump __common_int_entry; |
79 | #else /* CONFIG_EXACT_HWERR is defined */ | ||
80 | |||
81 | /* if we want hardware error to be exact, we need to do a SSYNC (which forces | ||
82 | * read/writes to complete to the memory controllers), and check to see if that | ||
83 | * caused a pending HW error condition. If so, we assume it was caused by user | ||
84 | * space, by setting the same interrupt that we are in (so it goes off again) | ||
85 | * and context restore, and a RTI (without servicing anything). This should | ||
86 | * cause the pending HWERR to fire, and when that is done, this interrupt will | ||
87 | * be re-serviced properly. | ||
88 | * As you can see by the code - we actually need to do two SSYNCS - one to | ||
89 | * make sure the read/writes complete, and another to make sure the hardware | ||
90 | * error is recognized by the core. | ||
91 | */ | ||
92 | #define INTERRUPT_ENTRY(N) \ | ||
93 | SSYNC; /* first sync: let outstanding reads/writes complete */ \ | ||
94 | SSYNC; /* second sync: let the core recognize any resulting hardware error */ \ | ||
95 | [--sp] = SYSCFG; \ | ||
96 | [--sp] = P0; /*orig_p0*/ \ | ||
97 | [--sp] = R0; /*orig_r0*/ \ | ||
98 | [--sp] = (R7:0,P5:0); \ | ||
99 | R1 = ASTAT; /* keep ASTAT: the bit test below writes CC; restored on both paths */ \ | ||
100 | P0.L = LO(ILAT); \ | ||
101 | P0.H = HI(ILAT); \ | ||
102 | R0 = [P0]; /* R0 = ILAT */ \ | ||
103 | CC = BITTST(R0, EVT_IVHW_P); /* is a hardware error latched? */ \ | ||
104 | IF CC JUMP 1f; /* yes: back out at 1: without servicing interrupt N */ \ | ||
105 | ASTAT = R1; /* no: put ASTAT back and enter the common handler */ \ | ||
106 | p0.l = lo(IPEND); \ | ||
107 | p0.h = hi(IPEND); \ | ||
108 | r1 = [p0]; /* r1 = IPEND */ \ | ||
109 | R0 = (N); /* R0 = the macro argument N */ \ | ||
110 | LOAD_IPIPE_IPEND \ | ||
111 | jump __common_int_entry; \ | ||
112 | 1: ASTAT = R1; /* hardware error pending: restore ASTAT */ \ | ||
113 | RAISE N; /* raise interrupt N again so it is serviced later */ \ | ||
114 | (R7:0, P5:0) = [SP++]; /* undo the register push */ \ | ||
115 | SP += 0x8; /* step over the orig_r0 and orig_p0 slots pushed above */ \ | ||
116 | SYSCFG = [SP++]; \ | ||
117 | CSYNC; \ | ||
118 | RTI; /* return without servicing anything */ | ||
119 | |||
120 | #define TIMER_INTERRUPT_ENTRY(N) \ | ||
121 | SSYNC; /* first sync: let outstanding reads/writes complete */ \ | ||
122 | SSYNC; /* second sync: let the core recognize any resulting hardware error */ \ | ||
123 | [--sp] = SYSCFG; \ | ||
124 | [--sp] = P0; /*orig_p0*/ \ | ||
125 | [--sp] = R0; /*orig_r0*/ \ | ||
126 | [--sp] = (R7:0,P5:0); \ | ||
127 | R1 = ASTAT; /* keep ASTAT: the bit test below writes CC; restored on both paths */ \ | ||
128 | P0.L = LO(ILAT); \ | ||
129 | P0.H = HI(ILAT); \ | ||
130 | R0 = [P0]; /* R0 = ILAT */ \ | ||
131 | CC = BITTST(R0, EVT_IVHW_P); /* is a hardware error latched? */ \ | ||
132 | IF CC JUMP 1f; /* yes: back out at 1: without servicing interrupt N */ \ | ||
133 | ASTAT = R1; /* no: put ASTAT back and enter the common handler */ \ | ||
134 | p0.l = lo(IPEND); \ | ||
135 | p0.h = hi(IPEND); \ | ||
136 | r1 = [p0]; /* r1 = IPEND */ \ | ||
137 | R0 = (N); /* R0 = the macro argument N */ \ | ||
138 | jump __common_int_entry; \ | ||
139 | 1: ASTAT = R1; /* hardware error pending: restore ASTAT */ \ | ||
140 | RAISE N; /* raise interrupt N again so it is serviced later */ \ | ||
141 | (R7:0, P5:0) = [SP++]; /* undo the register push */ \ | ||
142 | SP += 0x8; /* step over the orig_r0 and orig_p0 slots pushed above */ \ | ||
143 | SYSCFG = [SP++]; \ | ||
144 | CSYNC; \ | ||
145 | RTI; /* return without servicing anything */ | ||
146 | #endif /* CONFIG_EXACT_HWERR */ | ||
61 | 147 | ||
62 | /* This one pushes RETI without using CLI. Interrupts are enabled. */ | 148 | /* This one pushes RETI without using CLI. Interrupts are enabled. */ |
63 | #define SAVE_CONTEXT_SYSCALL save_context_syscall | 149 | #define SAVE_CONTEXT_SYSCALL save_context_syscall |
diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S index f0636fdcb353..da0558ad1b1a 100644 --- a/arch/blackfin/mach-common/entry.S +++ b/arch/blackfin/mach-common/entry.S | |||
@@ -200,7 +200,18 @@ ENTRY(_ex_single_step) | |||
200 | cc = r7 == 0; | 200 | cc = r7 == 0; |
201 | if !cc jump 1f; | 201 | if !cc jump 1f; |
202 | #endif | 202 | #endif |
203 | 203 | #ifdef CONFIG_EXACT_HWERR | |
204 | /* Read the ILAT, and check to see if the process we are | ||
205 | * single stepping caused a previous hardware error | ||
206 | * If so, do not single step, (which lowers to IRQ5, and makes | ||
207 | * us miss the error). | ||
208 | */ | ||
209 | p5.l = lo(ILAT); | ||
210 | p5.h = hi(ILAT); | ||
211 | r7 = [p5]; | ||
212 | cc = bittst(r7, EVT_IVHW_P); | ||
213 | if cc jump 1f; | ||
214 | #endif | ||
204 | /* Single stepping only a single instruction, so clear the trace | 215 | /* Single stepping only a single instruction, so clear the trace |
205 | * bit here. */ | 216 | * bit here. */ |
206 | r7 = syscfg; | 217 | r7 = syscfg; |
@@ -262,15 +273,6 @@ ENTRY(_bfin_return_from_exception) | |||
262 | r6 = 0x25; | 273 | r6 = 0x25; |
263 | CC = R7 == R6; | 274 | CC = R7 == R6; |
264 | if CC JUMP _double_fault; | 275 | if CC JUMP _double_fault; |
265 | |||
266 | /* Did we cause a HW error? */ | ||
267 | p5.l = lo(ILAT); | ||
268 | p5.h = hi(ILAT); | ||
269 | r6 = [p5]; | ||
270 | r7 = 0x20; /* Did I just cause anther HW error? */ | ||
271 | r6 = r7 & r6; | ||
272 | CC = R7 == R6; | ||
273 | if CC JUMP _double_fault; | ||
274 | #endif | 276 | #endif |
275 | 277 | ||
276 | (R7:6,P5:4) = [sp++]; | 278 | (R7:6,P5:4) = [sp++]; |
@@ -472,6 +474,16 @@ ENTRY(_trap) /* Exception: 4th entry into system event table(supervisor mode)*/ | |||
472 | [--sp] = ASTAT; | 474 | [--sp] = ASTAT; |
473 | [--sp] = (R7:6,P5:4); | 475 | [--sp] = (R7:6,P5:4); |
474 | 476 | ||
477 | #ifdef CONFIG_EXACT_HWERR | ||
478 | /* Make sure all pending read/writes complete. This will ensure any | ||
479 | * accesses which could cause hardware errors complete, and signal | ||
480 | * the hardware before we do something silly, like crash the | ||
481 | * kernel. We don't need to work around anomaly 05000312, since | ||
482 | * we are already atomic | ||
483 | */ | ||
484 | ssync; | ||
485 | #endif | ||
486 | |||
475 | #if ANOMALY_05000283 || ANOMALY_05000315 | 487 | #if ANOMALY_05000283 || ANOMALY_05000315 |
476 | cc = r7 == r7; | 488 | cc = r7 == r7; |
477 | p5.h = HI(CHIPID); | 489 | p5.h = HI(CHIPID); |
@@ -854,7 +866,7 @@ ENTRY(_ret_from_exception) | |||
854 | p1.h = _schedule_and_signal; | 866 | p1.h = _schedule_and_signal; |
855 | [p0] = p1; | 867 | [p0] = p1; |
856 | csync; | 868 | csync; |
857 | raise 15; /* raise evt14 to do signal or reschedule */ | 869 | raise 15; /* raise evt15 to do signal or reschedule */ |
858 | 4: | 870 | 4: |
859 | r0 = syscfg; | 871 | r0 = syscfg; |
860 | bitclr(r0, 0); | 872 | bitclr(r0, 0); |
@@ -915,7 +927,7 @@ ENTRY(_return_from_int) | |||
915 | p1.h = _schedule_and_signal_from_int; | 927 | p1.h = _schedule_and_signal_from_int; |
916 | [p0] = p1; | 928 | [p0] = p1; |
917 | csync; | 929 | csync; |
918 | #if ANOMALY_05000281 | 930 | #if ANOMALY_05000281 || ANOMALY_05000461 |
919 | r0.l = lo(SAFE_USER_INSTRUCTION); | 931 | r0.l = lo(SAFE_USER_INSTRUCTION); |
920 | r0.h = hi(SAFE_USER_INSTRUCTION); | 932 | r0.h = hi(SAFE_USER_INSTRUCTION); |
921 | reti = r0; | 933 | reti = r0; |
@@ -929,18 +941,27 @@ ENTRY(_return_from_int) | |||
929 | ENDPROC(_return_from_int) | 941 | ENDPROC(_return_from_int) |
930 | 942 | ||
931 | ENTRY(_lower_to_irq14) | 943 | ENTRY(_lower_to_irq14) |
932 | #if ANOMALY_05000281 | 944 | #if ANOMALY_05000281 || ANOMALY_05000461 |
933 | r0.l = lo(SAFE_USER_INSTRUCTION); | 945 | r0.l = lo(SAFE_USER_INSTRUCTION); |
934 | r0.h = hi(SAFE_USER_INSTRUCTION); | 946 | r0.h = hi(SAFE_USER_INSTRUCTION); |
935 | reti = r0; | 947 | reti = r0; |
936 | #endif | 948 | #endif |
937 | r0 = 0x401f; | 949 | |
950 | #ifdef CONFIG_DEBUG_HWERR | ||
951 | /* enable irq14 & hwerr interrupt, until we transition to _evt14_softirq */ | ||
952 | r0 = (EVT_IVG14 | EVT_IVHW | EVT_IRPTEN | EVT_EVX | EVT_NMI | EVT_RST | EVT_EMU); | ||
953 | #else | ||
954 | /* Only enable irq14 interrupt, until we transition to _evt14_softirq */ | ||
955 | r0 = (EVT_IVG14 | EVT_IRPTEN | EVT_EVX | EVT_NMI | EVT_RST | EVT_EMU); | ||
956 | #endif | ||
938 | sti r0; | 957 | sti r0; |
939 | raise 14; | 958 | raise 14; |
940 | rti; | 959 | rti; |
960 | ENDPROC(_lower_to_irq14) | ||
961 | |||
941 | ENTRY(_evt14_softirq) | 962 | ENTRY(_evt14_softirq) |
942 | #ifdef CONFIG_DEBUG_HWERR | 963 | #ifdef CONFIG_DEBUG_HWERR |
943 | r0 = 0x3f; | 964 | r0 = (EVT_IVHW | EVT_IRPTEN | EVT_EVX | EVT_NMI | EVT_RST | EVT_EMU); |
944 | sti r0; | 965 | sti r0; |
945 | #else | 966 | #else |
946 | cli r0; | 967 | cli r0; |
@@ -948,8 +969,9 @@ ENTRY(_evt14_softirq) | |||
948 | [--sp] = RETI; | 969 | [--sp] = RETI; |
949 | SP += 4; | 970 | SP += 4; |
950 | rts; | 971 | rts; |
972 | ENDPROC(_evt14_softirq) | ||
951 | 973 | ||
952 | _schedule_and_signal_from_int: | 974 | ENTRY(_schedule_and_signal_from_int) |
953 | /* To end up here, vector 15 was changed - so we have to change it | 975 | /* To end up here, vector 15 was changed - so we have to change it |
954 | * back. | 976 | * back. |
955 | */ | 977 | */ |
@@ -982,8 +1004,9 @@ _schedule_and_signal_from_int: | |||
982 | call _finish_atomic_sections; | 1004 | call _finish_atomic_sections; |
983 | sp += 12; | 1005 | sp += 12; |
984 | jump.s .Lresume_userspace; | 1006 | jump.s .Lresume_userspace; |
1007 | ENDPROC(_schedule_and_signal_from_int) | ||
985 | 1008 | ||
986 | _schedule_and_signal: | 1009 | ENTRY(_schedule_and_signal) |
987 | SAVE_CONTEXT_SYSCALL | 1010 | SAVE_CONTEXT_SYSCALL |
988 | /* To end up here, vector 15 was changed - so we have to change it | 1011 | /* To end up here, vector 15 was changed - so we have to change it |
989 | * back. | 1012 | * back. |
@@ -1001,7 +1024,7 @@ _schedule_and_signal: | |||
1001 | 1: | 1024 | 1: |
1002 | RESTORE_CONTEXT | 1025 | RESTORE_CONTEXT |
1003 | rti; | 1026 | rti; |
1004 | ENDPROC(_lower_to_irq14) | 1027 | ENDPROC(_schedule_and_signal) |
1005 | 1028 | ||
1006 | /* We handle this 100% in exception space - to reduce overhead | 1029 | /* We handle this 100% in exception space - to reduce overhead |
1007 | * Only potential problem is if the software buffer gets swapped out of the | 1030 | * Only potential problem is if the software buffer gets swapped out of the |
diff --git a/arch/blackfin/mach-common/interrupt.S b/arch/blackfin/mach-common/interrupt.S index 0069c2dd4625..9c46680186e4 100644 --- a/arch/blackfin/mach-common/interrupt.S +++ b/arch/blackfin/mach-common/interrupt.S | |||
@@ -145,6 +145,14 @@ __common_int_entry: | |||
145 | 145 | ||
146 | /* interrupt routine for ivhw - 5 */ | 146 | /* interrupt routine for ivhw - 5 */ |
147 | ENTRY(_evt_ivhw) | 147 | ENTRY(_evt_ivhw) |
148 | /* In case a single action kicks off multiple memory transactions, (like | ||
149 | * a cache line fetch) - this can cause multiple hardware errors, let's | ||
150 | * catch them all. First - make sure all the actions are complete, and | ||
151 | * the core sees the hardware errors. | ||
152 | */ | ||
153 | SSYNC; | ||
154 | SSYNC; | ||
155 | |||
148 | SAVE_ALL_SYS | 156 | SAVE_ALL_SYS |
149 | #ifdef CONFIG_FRAME_POINTER | 157 | #ifdef CONFIG_FRAME_POINTER |
150 | fp = 0; | 158 | fp = 0; |
@@ -159,6 +167,25 @@ ENTRY(_evt_ivhw) | |||
159 | 1: | 167 | 1: |
160 | #endif | 168 | #endif |
161 | 169 | ||
170 | /* Handle all stacked hardware errors: re-read ILAT and clear IVHW while it | ||
171 | * stays latched. R0 counts the clears and R2 holds the limit, so we give | ||
172 | * up after 10 rounds instead of hanging forever. */ | ||
173 | R0 = 0; | ||
174 | R2 = 10; | ||
175 | 1: | ||
176 | P0.L = LO(ILAT); | ||
177 | P0.H = HI(ILAT); | ||
178 | R1 = [P0]; | ||
179 | CC = BITTST(R1, EVT_IVHW_P); | ||
180 | IF ! CC JUMP 2f; | ||
181 | /* OK a hardware error is pending - clear it. NOTE(review): this writes the bit index EVT_IVHW_P, not a bit mask - confirm */ | ||
182 | R1 = EVT_IVHW_P; | ||
183 | [P0] = R1; | ||
184 | R0 += 1; | ||
185 | CC = R0 == R2; /* compare the counter (R0), not R1 which now holds EVT_IVHW_P */ | ||
186 | if CC JUMP 2f; | ||
187 | JUMP 1b; | ||
188 | 2: | ||
162 | # We are going to dump something out, so make sure we print IPEND properly | 189 | # We are going to dump something out, so make sure we print IPEND properly |
163 | p2.l = lo(IPEND); | 190 | p2.l = lo(IPEND); |
164 | p2.h = hi(IPEND); | 191 | p2.h = hi(IPEND); |