path: root/arch/tile/kernel/intvec_64.S
author    Chris Metcalf <cmetcalf@tilera.com>  2013-08-06 16:04:13 -0400
committer Chris Metcalf <cmetcalf@tilera.com>  2013-08-13 16:04:10 -0400
commit    2f9ac29eec71a696cb0dcc5fb82c0f8d4dac28c9 (patch)
tree      ee33ba7e452e8614130a811211eb2383a3133194 /arch/tile/kernel/intvec_64.S
parent    f10da5472c6907a3fbd6886224b36d21925ce47b (diff)
tile: fast-path unaligned memory access for tilegx
This change enables unaligned userspace memory access via a kernel
fast path on tilegx. The kernel tracks user PC/instruction pairs
per-thread using a direct-mapped cache in userspace. The cache maps
those PC/instruction pairs to JIT'ed instruction sequences that load
or store using byte-wide load/store instructions and then synthesize
the 2-, 4-, or 8-byte load or store result. Once an instruction has
been seen to generate an unaligned access, subsequent hits on that
instruction typically incur an overhead of only around 50 cycles if
the cache and TLB are hot.

We support the prctl() PR_GET_UNALIGN / PR_SET_UNALIGN system calls
to enable or disable unaligned fixups on a per-process basis.

To do this we pull some of the tilepro unaligned support out of the
single_step.c file; tilepro uses instruction disassembly for both
single-step and unaligned access support. Since tilegx actually has
hardware single-step support, though, it's cleaner to keep the tilegx
unaligned access code in a separate file. While we're at it, properly
rename the tilepro-specific types, etc., to have tilepro suffixes
instead of generic tile suffixes.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
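The per-process control mentioned above goes through the generic
prctl() interface. A minimal userspace sketch, assuming the usual
PR_SET_UNALIGN / PR_GET_UNALIGN and PR_UNALIGN_* constants exposed
via <sys/prctl.h> (error handling trimmed):

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	unsigned int mode;

	/* Ask the kernel to silently fix up our unaligned accesses. */
	if (prctl(PR_SET_UNALIGN, PR_UNALIGN_NOPRINT) != 0)
		perror("PR_SET_UNALIGN");

	/* Read back the current per-process setting. */
	if (prctl(PR_GET_UNALIGN, &mode) == 0)
		printf("unalign mode: %s\n",
		       (mode & PR_UNALIGN_SIGBUS) ? "SIGBUS" : "fixup");
	return 0;
}

Passing PR_UNALIGN_SIGBUS instead disables the fixup path and
delivers SIGBUS on any unaligned access.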
Diffstat (limited to 'arch/tile/kernel/intvec_64.S')
-rw-r--r--  arch/tile/kernel/intvec_64.S | 231
1 file changed, 224 insertions(+), 7 deletions(-)
diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S
index 85d483957027..884af9ea5bed 100644
--- a/arch/tile/kernel/intvec_64.S
+++ b/arch/tile/kernel/intvec_64.S
@@ -17,11 +17,13 @@
 #include <linux/linkage.h>
 #include <linux/errno.h>
 #include <linux/unistd.h>
+#include <linux/init.h>
 #include <asm/ptrace.h>
 #include <asm/thread_info.h>
 #include <asm/irqflags.h>
 #include <asm/asm-offsets.h>
 #include <asm/types.h>
+#include <asm/traps.h>
 #include <asm/signal.h>
 #include <hv/hypervisor.h>
 #include <arch/abi.h>
@@ -98,6 +100,189 @@
 	}
 	.endm
 
+	/*
+	 * Unalign data exception fast handling: In order to handle
+	 * unaligned data access, a fast JIT version is generated and stored
+	 * in a specific area in user space. We first need to do a quick poke
+	 * to see if the JIT is available. We use certain bits in the fault
+	 * PC (3 to 9 is used for 16KB page size) as index to address the JIT
+	 * code area. The first 64bit word is the fault PC, and the 2nd one is
+	 * the fault bundle itself. If these 2 words both match, then we
+	 * directly "iret" to JIT code. If not, a slow path is invoked to
+	 * generate new JIT code. Note: the current JIT code WILL be
+	 * overwritten if it existed. So, ideally we can handle 128 unalign
+	 * fixups via JIT. For lookup efficiency and to effectively support
+	 * tight loops with multiple unaligned reference, a simple
+	 * direct-mapped cache is used.
+	 *
+	 * SPR_EX_CONTEXT_K_0 is modified to return to JIT code.
+	 * SPR_EX_CONTEXT_K_1 has ICS set.
+	 * SPR_EX_CONTEXT_0_0 is setup to user program's next PC.
+	 * SPR_EX_CONTEXT_0_1 = 0.
+	 */
+	.macro int_hand_unalign_fast  vecnum, vecname
+	.org  (\vecnum << 8)
+intvec_\vecname:
+	/* Put r3 in SPR_SYSTEM_SAVE_K_1. */
+	mtspr   SPR_SYSTEM_SAVE_K_1, r3
+
+	mfspr   r3, SPR_EX_CONTEXT_K_1
+	/*
+	 * Examine if exception comes from user without ICS set.
+	 * If not, just go directly to the slow path.
+	 */
+	bnez    r3, hand_unalign_slow_nonuser
+
+	mfspr   r3, SPR_SYSTEM_SAVE_K_0
+
+	/* Get &thread_info->unalign_jit_tmp[0] in r3. */
+	mm      r3, zero, LOG2_THREAD_SIZE, 63
+#if THREAD_SIZE < 65536
+	addli   r3, r3, -(PAGE_SIZE - THREAD_INFO_UNALIGN_JIT_TMP_OFFSET)
+#else
+	addli   r3, r3, -(PAGE_SIZE/2)
+	addli   r3, r3, -(PAGE_SIZE/2 - THREAD_INFO_UNALIGN_JIT_TMP_OFFSET)
+#endif
+
+	/*
+	 * Save r0, r1, r2 into thread_info array r3 points to
+	 * from low to high memory in order.
+	 */
+	st_add  r3, r0, 8
+	st_add  r3, r1, 8
+	{
+	 st_add r3, r2, 8
+	 andi   r2, sp, 7
+	}
+
+	/* Save stored r3 value so we can revert it on a page fault. */
+	mfspr   r1, SPR_SYSTEM_SAVE_K_1
+	st      r3, r1
+
+	{
+	 /* Generate a SIGBUS if sp is not 8-byte aligned. */
+	 bnez   r2, hand_unalign_slow_badsp
+	}
+
+	/*
+	 * Get the thread_info in r0; load r1 with pc. Set the low bit of sp
+	 * as an indicator to the page fault code in case we fault.
+	 */
+	{
+	 ori    sp, sp, 1
+	 mfspr  r1, SPR_EX_CONTEXT_K_0
+	}
+
+	/* Add the jit_info offset in thread_info; extract r1 [3:9] into r2. */
+	{
+	 addli  r0, r3, THREAD_INFO_UNALIGN_JIT_BASE_OFFSET - \
+		(THREAD_INFO_UNALIGN_JIT_TMP_OFFSET + (3 * 8))
+	 bfextu r2, r1, 3, (2 + PAGE_SHIFT - UNALIGN_JIT_SHIFT)
+	}
+
+	/* Load the jit_info; multiply r2 by 128. */
+	{
+	 ld     r0, r0
+	 shli   r2, r2, UNALIGN_JIT_SHIFT
+	}
+
+	/*
+	 * If r0 is NULL, the JIT page is not mapped, so go to slow path;
+	 * add offset r2 to r0 at the same time.
+	 */
+	{
+	 beqz   r0, hand_unalign_slow
+	 add    r2, r0, r2
+	}
+
+	/*
+	 * We are loading from userspace (both the JIT info PC and
+	 * instruction word, and the instruction word we executed)
+	 * and since either could fault while holding the interrupt
+	 * critical section, we must tag this region and check it in
+	 * do_page_fault() to handle it properly.
+	 */
+ENTRY(__start_unalign_asm_code)
+
+	/* Load first word of JIT in r0 and increment r2 by 8. */
+	ld_add  r0, r2, 8
+
+	/*
+	 * Compare the PC with the 1st word in JIT; load the fault bundle
+	 * into r1.
+	 */
+	{
+	 cmpeq  r0, r0, r1
+	 ld     r1, r1
+	}
+
+	/* Go to slow path if PC doesn't match. */
+	beqz    r0, hand_unalign_slow
+
+	/*
+	 * Load the 2nd word of JIT, which is supposed to be the fault
+	 * bundle for a cache hit. Increment r2; after this bundle r2 will
+	 * point to the potential start of the JIT code we want to run.
+	 */
+	ld_add  r0, r2, 8
+
+	/* No further accesses to userspace are done after this point. */
+ENTRY(__end_unalign_asm_code)
+
+	/* Compare the real bundle with what is saved in the JIT area. */
+	{
+	 cmpeq  r0, r1, r0
+	 mtspr  SPR_EX_CONTEXT_0_1, zero
+	}
+
+	/* Go to slow path if the fault bundle does not match. */
+	beqz    r0, hand_unalign_slow
+
+	/*
+	 * A cache hit is found.
+	 * r2 points to start of JIT code (3rd word).
+	 * r0 is the fault pc.
+	 * r1 is the fault bundle.
+	 * Reset the low bit of sp.
+	 */
+	{
+	 mfspr  r0, SPR_EX_CONTEXT_K_0
+	 andi   sp, sp, ~1
+	}
+
+	/* Write r2 into EX_CONTEXT_K_0 and increment PC. */
+	{
+	 mtspr  SPR_EX_CONTEXT_K_0, r2
+	 addi   r0, r0, 8
+	}
+
+	/*
+	 * Set ICS on kernel EX_CONTEXT_K_1 in order to "iret" to
+	 * user with ICS set. This way, if the JIT fixup causes another
+	 * unalign exception (which shouldn't be possible) the user
+	 * process will be terminated with SIGBUS. Also, our fixup will
+	 * run without interleaving with external interrupts.
+	 * Each fixup is at most 14 bundles, so it won't hold ICS for long.
+	 */
+	{
+	 movei  r1, PL_ICS_EX1(USER_PL, 1)
+	 mtspr  SPR_EX_CONTEXT_0_0, r0
+	}
+
+	{
+	 mtspr  SPR_EX_CONTEXT_K_1, r1
+	 addi   r3, r3, -(3 * 8)
+	}
+
+	/* Restore r0..r3. */
+	ld_add  r0, r3, 8
+	ld_add  r1, r3, 8
+	ld_add  r2, r3, 8
+	ld      r3, r3
+
+	iret
+	ENDPROC(intvec_\vecname)
+	.endm
 
 #ifdef __COLLECT_LINKER_FEEDBACK__
 	.pushsection .text.intvec_feedback,"ax"
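To make the lookup described in the comment block above concrete,
here is a rough C model of the direct-mapped JIT cache the fast path
probes. The struct and names (unalign_jit_entry, jit_lookup, etc.)
are illustrative only, not the kernel's; what the sketch preserves is
the arithmetic: 128-byte entries (UNALIGN_JIT_SHIFT of 7), an index
taken from PC bits 3..9 with a 16KB page, and a tag check on both the
fault PC and the fault bundle.

#include <stdint.h>
#include <stddef.h>

/* Hypothetical model of one JIT cache entry: 128 bytes total,
 * matching the "multiply r2 by 128" step (UNALIGN_JIT_SHIFT == 7). */
struct unalign_jit_entry {
	uint64_t pc;            /* fault PC (1st word checked)       */
	uint64_t bundle;        /* fault instruction (2nd word)      */
	uint64_t code[14];      /* JIT'ed fixup, at most 14 bundles  */
};

#define UNALIGN_JIT_ENTRIES 128  /* direct-mapped: 128 live fixups */

/* Index with PC bits [3, 10): for a 16KB page, bits 3..9 give
 * exactly 128 slots, as the bfextu in the handler computes. */
static inline size_t jit_index(uint64_t pc)
{
	return (pc >> 3) & (UNALIGN_JIT_ENTRIES - 1);
}

/* Return runnable JIT code on a hit, NULL on a miss. */
static uint64_t *jit_lookup(struct unalign_jit_entry *cache,
			    uint64_t pc, uint64_t bundle)
{
	struct unalign_jit_entry *e = &cache[jit_index(pc)];

	if (e->pc == pc && e->bundle == bundle)
		return e->code;  /* hit: the handler "iret"s here */
	return NULL;
}

A miss (or tag mismatch) falls through to the slow path, which
regenerates the JIT sequence and overwrites whatever previously
occupied that slot, exactly as the "WILL be overwritten" note above
warns.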
@@ -118,15 +303,21 @@ intvec_feedback:
  * The "processing" argument specifies the code for processing
  * the interrupt. Defaults to "handle_interrupt".
  */
-	.macro  int_hand  vecnum, vecname, c_routine, processing=handle_interrupt
-	.org    (\vecnum << 8)
+	.macro  __int_hand  vecnum, vecname, c_routine, processing=handle_interrupt
 intvec_\vecname:
 	/* Temporarily save a register so we have somewhere to work. */
 
 	mtspr   SPR_SYSTEM_SAVE_K_1, r0
 	mfspr   r0, SPR_EX_CONTEXT_K_1
 
-	andi    r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	/*
+	 * The unalign data fastpath code sets the low bit in sp to
+	 * force us to reset it here on fault.
+	 */
+	{
+	 blbs   sp, 2f
+	 andi   r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	}
 
 	.ifc \vecnum, INT_DOUBLE_FAULT
 	/*
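The blbs test added above works because the fast path borrows bit 0
of sp as an "unalign fixup in flight" marker: a valid user sp must be
8-byte aligned (the fast path raises SIGBUS otherwise), so the low
bit is always free to carry state. A small, hedged C illustration of
the pointer-tagging idiom in the abstract (generic, not the kernel's
actual code):

#include <stdint.h>
#include <assert.h>

/* An 8-byte-aligned value has its low 3 bits clear, so bit 0 can
 * smuggle a one-bit flag through code that only sees the pointer. */
static inline uintptr_t tag_in_fixup(uintptr_t sp)
{
	assert((sp & 7) == 0);   /* fast path enforces 8-byte alignment */
	return sp | 1;           /* "ori sp, sp, 1" in the handler      */
}

static inline int is_in_fixup(uintptr_t sp)
{
	return sp & 1;           /* what "blbs sp, 2f" branches on */
}

static inline uintptr_t clear_tag(uintptr_t sp)
{
	return sp & ~(uintptr_t)1;  /* "andi sp, sp, ~1" */
}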
@@ -176,7 +367,7 @@ intvec_\vecname:
 	}
 	.endif
 
-
+2:
 	/*
 	 * SYSTEM_SAVE_K_0 holds the cpu number in the low bits, and
 	 * the current stack top in the higher bits. So we recover
@@ -1223,10 +1414,31 @@ STD_ENTRY(_sys_clone)
 	j       sys_clone
 	STD_ENDPROC(_sys_clone)
 
-/* The single-step support may need to read all the registers. */
+	/*
+	 * Recover r3, r2, r1 and r0 here saved by unalign fast vector.
+	 * The vector area limit is 32 bundles, so we handle the reload here.
+	 * r0, r1, r2 are in thread_info from low to high memory in order.
+	 * r3 points to location the original r3 was saved.
+	 * We put this code in the __HEAD section so it can be reached
+	 * via a conditional branch from the fast path.
+	 */
+	__HEAD
+hand_unalign_slow:
+	andi    sp, sp, ~1
+hand_unalign_slow_badsp:
+	addi    r3, r3, -(3 * 8)
+	ld_add  r0, r3, 8
+	ld_add  r1, r3, 8
+	ld      r2, r3
+hand_unalign_slow_nonuser:
+	mfspr   r3, SPR_SYSTEM_SAVE_K_1
+	__int_hand    INT_UNALIGN_DATA, UNALIGN_DATA_SLOW, int_unalign
+
+/* The unaligned data support needs to read all the registers. */
 int_unalign:
 	push_extra_callee_saves r0
-	j       do_trap
+	j       do_unaligned
+ENDPROC(hand_unalign_slow)
 
 /* Fill the return address stack with nonzero entries. */
 STD_ENTRY(fill_ra_stack)
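On a miss the slow path lands in do_unaligned (in the new tilegx
unaligned.c), which decodes the fault bundle and emits the byte-wide
JIT sequence. The fixup those JIT'ed bundles perform reduces to the
following idea, sketched here in C assuming tilegx's little-endian
byte order (the real code emits tilegx bundles and also handles
stores and sign extension):

#include <stdint.h>

/* Synthesize an unaligned N-byte little-endian load from byte
 * loads, the same trick the JIT'ed bundles play with byte-wide
 * loads, shifts, and ORs. */
static uint64_t load_unaligned(const void *addr, unsigned int size)
{
	const uint8_t *p = addr;
	uint64_t val = 0;

	for (unsigned int i = 0; i < size; i++)  /* size is 2, 4 or 8 */
		val |= (uint64_t)p[i] << (8 * i);
	return val;
}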
@@ -1240,6 +1452,11 @@ STD_ENTRY(fill_ra_stack)
 4:	jrp     r0
 	STD_ENDPROC(fill_ra_stack)
 
+	.macro int_hand  vecnum, vecname, c_routine, processing=handle_interrupt
+	.org   (\vecnum << 8)
+	__int_hand   \vecnum, \vecname, \c_routine, \processing
+	.endm
+
 /* Include .intrpt1 array of interrupt vectors */
 	.section ".intrpt1", "ax"
 
@@ -1272,7 +1489,7 @@ STD_ENTRY(fill_ra_stack)
 	int_hand     INT_SWINT_1, SWINT_1, SYSCALL, handle_syscall
 	int_hand     INT_SWINT_0, SWINT_0, do_trap
 	int_hand     INT_ILL_TRANS, ILL_TRANS, do_trap
-	int_hand     INT_UNALIGN_DATA, UNALIGN_DATA, int_unalign
+	int_hand_unalign_fast INT_UNALIGN_DATA, UNALIGN_DATA
 	int_hand     INT_DTLB_MISS, DTLB_MISS, do_page_fault
 	int_hand     INT_DTLB_ACCESS, DTLB_ACCESS, do_page_fault
 	int_hand     INT_IDN_FIREWALL, IDN_FIREWALL, do_hardwall_trap