powerpc: Use enhanced touch instructions in POWER7 copy_to_user/copy_from_user

Version 2.06 of the POWER ISA introduced enhanced touch instructions, allowing us to specify a number of attributes including the length of a stream.

This patch adds a software stream for both loads and stores in the POWER7 copy_tofrom_user loop. Since the setup is quite complicated and we have to use an eieio to ensure correct ordering of the "GO" command, we only do this for copies above 4kB.

To quantify any performance improvements we need a working set bigger than the caches so we operate on a 1GB file:

    # dd if=/dev/zero of=/tmp/foo bs=1M count=1024

And we compare how fast we can read the file:

    # dd if=/tmp/foo of=/dev/null bs=1M

    before: 7.7 GB/s
    after:  9.6 GB/s

A 25% improvement.

The worst case for this patch will be a completely L1 cache contained copy of just over 4kB. We can test this with the copy_to_user testcase we used to tune copy_tofrom_user originally:

    http://ozlabs.org/~anton/junkcode/copy_to_user.c

    # time ./copy_to_user2 -l 4224 -i 10000000

    before: 6.807 s
    after:  6.946 s

A 2% slowdown, which seems reasonable considering our data is unlikely to be completely L1 contained.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
author: Anton Blanchard <anton@samba.org> 2012-05-30 16:19:19 -0400
committer: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2012-07-03 00:14:45 -0400
commit: bce4b4bd91efab9dca693ac37c8ddf88103280d8 (patch)
tree: d6ffb22afdc6a3a1d8482cb226a68865c928837c /arch/powerpc
parent: 8127e723dab6f6e7949da43f87e5f946c4b99cf2 (diff)
1 files changed, 31 insertions, 0 deletions
diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S
index f560f83a3ab0..48e3f8c5768c 100644
--- a/arch/powerpc/lib/copyuser_power7.S
+++ b/arch/powerpc/lib/copyuser_power7.S
@@ -329,6 +329,37 @@ err1;	stb	r0,0(r3)
        dcbt    r0,r8,0b01010   /* GO */
 .machine pop
+        /*
+         * We prefetch both the source and destination using enhanced touch
+         * instructions. We use a stream ID of 0 for the load side and
+         * 1 for the store side.
+         */
+        clrrdi  r6,r4,7
+        clrrdi  r9,r3,7
+        ori     r9,r9,1         /* stream=1 */
+        srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
+        cmpldi  cr1,r7,0x3FF
+        ble     cr1,1f
+        li      r7,0x3FF
+1:      lis     r0,0x0E00       /* depth=7 */
+        sldi    r7,r7,7
+        or      r7,r7,r0
+        ori     r10,r7,1        /* stream=1 */
+        lis     r8,0x8000       /* GO=1 */
+        clrldi  r8,r8,32
+.machine push
+.machine "power4"
+        dcbt    r0,r6,0b01000
+        dcbt    r0,r7,0b01010
+        dcbtst  r0,r9,0b01000
+        dcbtst  r0,r10,0b01010
+        eieio
+        dcbt    r0,r8,0b01010   /* GO */
+.machine pop
        beq     .Lunwind_stack_nonvmx_copy
        /*
author	Anton Blanchard <anton@samba.org>	2012-05-30 16:19:19 -0400
committer	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2012-07-03 00:14:45 -0400
commit	bce4b4bd91efab9dca693ac37c8ddf88103280d8 (patch)
tree	d6ffb22afdc6a3a1d8482cb226a68865c928837c /arch/powerpc
parent	8127e723dab6f6e7949da43f87e5f946c4b99cf2 (diff)

diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S
index f560f83a3ab0..48e3f8c5768c 100644
--- a/arch/powerpc/lib/copyuser_power7.S
+++ b/arch/powerpc/lib/copyuser_power7.S
@@ -329,6 +329,37 @@ err1; stb r0,0(r3)
329	dcbt r0,r8,0b01010 /* GO */	329	dcbt r0,r8,0b01010 /* GO */
330	.machine pop	330	.machine pop
331		331
		332	/*
		333	* We prefetch both the source and destination using enhanced touch
		334	* instructions. We use a stream ID of 0 for the load side and
		335	* 1 for the store side.
		336	*/
		337	clrrdi r6,r4,7
		338	clrrdi r9,r3,7
		339	ori r9,r9,1 /* stream=1 */
		340
		341	srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */
		342	cmpldi cr1,r7,0x3FF
		343	ble cr1,1f
		344	li r7,0x3FF
		345	1: lis r0,0x0E00 /* depth=7 */
		346	sldi r7,r7,7
		347	or r7,r7,r0
		348	ori r10,r7,1 /* stream=1 */
		349
		350	lis r8,0x8000 /* GO=1 */
		351	clrldi r8,r8,32
		352
		353	.machine push
		354	.machine "power4"
		355	dcbt r0,r6,0b01000
		356	dcbt r0,r7,0b01010
		357	dcbtst r0,r9,0b01000
		358	dcbtst r0,r10,0b01010
		359	eieio
		360	dcbt r0,r8,0b01010 /* GO */
		361	.machine pop
		362
332	beq .Lunwind_stack_nonvmx_copy	363	beq .Lunwind_stack_nonvmx_copy
333		364
334	/*	365	/*