aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/lib/string.S
diff options
context:
space:
mode:
author: Anton Blanchard <anton@samba.org> 2012-05-27 15:54:03 -0400
committer: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2012-07-03 00:14:41 -0400
commit: 17968fbbd19f1bb281ee4eb2548764ac5664c4ec (patch)
tree: c6b7a68ea7897e6bf213bffa4795fe209609cba9 /arch/powerpc/lib/string.S
parent: d136e27326a3bd50d7929a43c018abf13e426b7e (diff)
powerpc: 64bit optimised __clear_user
I noticed __clear_user high up in a profile of one of my RAID stress tests. The testcase was doing a dd from /dev/zero which ends up calling __clear_user.

__clear_user is basically a loop with a single 4 byte store which is horribly slow. We can do much better by aligning the destination and doing 32 bytes of 8 byte stores in a loop.

The following testcase was used to verify the patch: http://ozlabs.org/~anton/junkcode/stress_clear_user.c

To show the improvement in performance I ran a dd from /dev/zero to /dev/null on a POWER7 box:

Before:
# dd if=/dev/zero of=/dev/null bs=1M count=10000
10485760000 bytes (10 GB) copied, 3.72379 s, 2.8 GB/s

After:
# time dd if=/dev/zero of=/dev/null bs=1M count=10000
10485760000 bytes (10 GB) copied, 0.728318 s, 14.4 GB/s

Over 5x faster.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/lib/string.S')
-rw-r--r-- arch/powerpc/lib/string.S | 2
1 files changed, 2 insertions, 0 deletions
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 093d6316435c..1b5a0a09d609 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -119,6 +119,7 @@ _GLOBAL(memchr)
1192: li r3,0 1192: li r3,0
120 blr 120 blr
121 121
122#ifdef CONFIG_PPC32
122_GLOBAL(__clear_user) 123_GLOBAL(__clear_user)
123 addi r6,r3,-4 124 addi r6,r3,-4
124 li r3,0 125 li r3,0
@@ -160,3 +161,4 @@ _GLOBAL(__clear_user)
160 PPC_LONG 1b,91b 161 PPC_LONG 1b,91b
161 PPC_LONG 8b,92b 162 PPC_LONG 8b,92b
162 .text 163 .text
164#endif