aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc
diff options
context:
space:
mode:
author: Anton Blanchard <anton@samba.org> 2012-05-30 16:19:19 -0400
committer: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2012-07-03 00:14:45 -0400
commit: bce4b4bd91efab9dca693ac37c8ddf88103280d8 (patch)
tree: d6ffb22afdc6a3a1d8482cb226a68865c928837c /arch/powerpc
parent: 8127e723dab6f6e7949da43f87e5f946c4b99cf2 (diff)
powerpc: Use enhanced touch instructions in POWER7 copy_to_user/copy_from_user
Version 2.06 of the POWER ISA introduced enhanced touch instructions, allowing us to specify a number of attributes including the length of a stream.

This patch adds a software stream for both loads and stores in the POWER7 copy_tofrom_user loop. Since the setup is quite complicated and we have to use an eieio to ensure correct ordering of the "GO" command, we only do this for copies above 4kB.

To quantify any performance improvements we need a working set bigger than the caches, so we operate on a 1GB file:

    # dd if=/dev/zero of=/tmp/foo bs=1M count=1024

And we compare how fast we can read the file:

    # dd if=/tmp/foo of=/dev/null bs=1M

    before: 7.7 GB/s
    after:  9.6 GB/s

A 25% improvement.

The worst case for this patch will be a completely L1 cache contained copy of just over 4kB. We can test this with the copy_to_user testcase we used to tune copy_tofrom_user originally:

    http://ozlabs.org/~anton/junkcode/copy_to_user.c

    # time ./copy_to_user2 -l 4224 -i 10000000

    before: 6.807 s
    after:  6.946 s

A 2% slowdown, which seems reasonable considering our data is unlikely to be completely L1 contained.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc')
-rw-r--r-- arch/powerpc/lib/copyuser_power7.S | 31
1 file changed, 31 insertions, 0 deletions
diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S
index f560f83a3ab0..48e3f8c5768c 100644
--- a/arch/powerpc/lib/copyuser_power7.S
+++ b/arch/powerpc/lib/copyuser_power7.S
@@ -329,6 +329,37 @@ err1; stb r0,0(r3)
329 dcbt r0,r8,0b01010 /* GO */ 329 dcbt r0,r8,0b01010 /* GO */
330.machine pop 330.machine pop
331 331
332 /*
333 * We prefetch both the source and destination using enhanced touch
334 * instructions. We use a stream ID of 0 for the load side and
335 * 1 for the store side.
336 */
337 clrrdi r6,r4,7
338 clrrdi r9,r3,7
339 ori r9,r9,1 /* stream=1 */
340
341 srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */
342 cmpldi cr1,r7,0x3FF
343 ble cr1,1f
344 li r7,0x3FF
3451: lis r0,0x0E00 /* depth=7 */
346 sldi r7,r7,7
347 or r7,r7,r0
348 ori r10,r7,1 /* stream=1 */
349
350 lis r8,0x8000 /* GO=1 */
351 clrldi r8,r8,32
352
353.machine push
354.machine "power4"
355 dcbt r0,r6,0b01000
356 dcbt r0,r7,0b01010
357 dcbtst r0,r9,0b01000
358 dcbtst r0,r10,0b01010
359 eieio
360 dcbt r0,r8,0b01010 /* GO */
361.machine pop
362
332 beq .Lunwind_stack_nonvmx_copy 363 beq .Lunwind_stack_nonvmx_copy
333 364
334 /* 365 /*