aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/lib
diff options
context:
space:
mode:
author: Anton Blanchard <anton@samba.org> 2012-05-28 18:14:32 -0400
committer: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2012-07-03 00:14:42 -0400
commit: a9514dc69d5c4f5d6d9e4b8eed40172abd150c61 (patch)
tree: 5f71e45a5dc49a40cf2172964888db064183b090 /arch/powerpc/lib
parent: e250d4bca6cb91471e0757179a152c0943ecce4a (diff)
powerpc: Use enhanced touch instructions in POWER7 copy_to_user/copy_from_user
Version 2.06 of the POWER ISA introduced enhanced touch instructions,
allowing us to specify a number of attributes including the length of
a stream.

This patch adds a software stream for both loads and stores in the
POWER7 copy_tofrom_user loop. Since the setup is quite complicated
and we have to use an eieio to ensure correct ordering of the "GO"
command we only do this for copies above 4kB.

To quantify any performance improvements we need a working set
bigger than the caches so we operate on a 1GB file:

# dd if=/dev/zero of=/tmp/foo bs=1M count=1024

And we compare how fast we can read the file:

# dd if=/tmp/foo of=/dev/null bs=1M

before: 7.7 GB/s
after:  9.6 GB/s

A 25% improvement.

The worst case for this patch will be a completely L1 cache contained
copy of just over 4kB. We can test this with the copy_to_user
testcase we used to tune copy_tofrom_user originally:

http://ozlabs.org/~anton/junkcode/copy_to_user.c

# time ./copy_to_user2 -l 4224 -i 10000000

before: 6.807 s
after:  6.946 s

A 2% slowdown, which seems reasonable considering our data is
unlikely to be completely L1 contained.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r-- arch/powerpc/lib/copyuser_power7.S | 31
1 files changed, 31 insertions, 0 deletions
diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S
index 497db7b23bb1..9c982cdec3cf 100644
--- a/arch/powerpc/lib/copyuser_power7.S
+++ b/arch/powerpc/lib/copyuser_power7.S
@@ -298,6 +298,37 @@ err1; stb r0,0(r3)
298 ld r5,STACKFRAMESIZE+64(r1) 298 ld r5,STACKFRAMESIZE+64(r1)
299 mtlr r0 299 mtlr r0
300 300
301 /*
302 * We prefetch both the source and destination using enhanced touch
303 * instructions. We use a stream ID of 0 for the load side and
304 * 1 for the store side.
305 */
306 clrrdi r6,r4,7
307 clrrdi r9,r3,7
308 ori r9,r9,1 /* stream=1 */
309
310 srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */
311 cmpldi r7,0x3FF
312 ble 1f
313 li r7,0x3FF
3141: lis r0,0x0E00 /* depth=7 */
315 sldi r7,r7,7
316 or r7,r7,r0
317 ori r10,r7,1 /* stream=1 */
318
319 lis r8,0x8000 /* GO=1 */
320 clrldi r8,r8,32
321
322.machine push
323.machine "power4"
324 dcbt r0,r6,0b01000
325 dcbt r0,r7,0b01010
326 dcbtst r0,r9,0b01000
327 dcbtst r0,r10,0b01010
328 eieio
329 dcbt r0,r8,0b01010 /* GO */
330.machine pop
331
301 beq .Lunwind_stack_nonvmx_copy 332 beq .Lunwind_stack_nonvmx_copy
302 333
303 /* 334 /*