aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/lib/string_64.S
diff options
context:
space:
mode:
authorAnton Blanchard <anton@samba.org>2012-06-04 12:02:22 -0400
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2012-07-03 00:14:48 -0400
commitcf8fb5533f35709ba7e31560264b565a9c7a090f (patch)
tree5964991a949f151f59dd427acffe3572c9edb2d1 /arch/powerpc/lib/string_64.S
parentb4c3a8729ae57b4f84d661e16a192f828eca1d03 (diff)
powerpc: Optimise the 64bit optimised __clear_user
I blame Mikey for this. He elevated my slightly dubious testcase: `dd if=/dev/zero of=/dev/null bs=1M count=10000` to benchmark status. And naturally we need to be number 1 at creating zeros. So let's improve __clear_user some more. As Paul suggests, we can use dcbz for large lengths. This patch gets the destination cacheline aligned, then uses dcbz on whole cachelines. Before: 10485760000 bytes (10 GB) copied, 0.414744 s, 25.3 GB/s; After: 10485760000 bytes (10 GB) copied, 0.268597 s, 39.0 GB/s. 39 GB/s, a new record. Signed-off-by: Anton Blanchard <anton@samba.org> Tested-by: Olof Johansson <olof@lixom.net> Acked-by: Olof Johansson <olof@lixom.net> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/lib/string_64.S')
-rw-r--r--arch/powerpc/lib/string_64.S63
1 files changed, 62 insertions, 1 deletions
diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S
index 6613b9047005..3b1e48049faf 100644
--- a/arch/powerpc/lib/string_64.S
+++ b/arch/powerpc/lib/string_64.S
@@ -19,6 +19,12 @@
19 */ 19 */
20 20
21#include <asm/ppc_asm.h> 21#include <asm/ppc_asm.h>
22#include <asm/asm-offsets.h>
23
24 .section ".toc","aw"
25PPC64_CACHES:
26 .tc ppc64_caches[TC],ppc64_caches
27 .section ".text"
22 28
23/** 29/**
24 * __clear_user: - Zero a block of memory in user space, with less checking. 30 * __clear_user: - Zero a block of memory in user space, with less checking.
@@ -94,9 +100,14 @@ err1; stw r0,0(r3)
94 addi r3,r3,4 100 addi r3,r3,4
95 101
963: sub r4,r4,r6 1023: sub r4,r4,r6
97 srdi r6,r4,5 103
98 cmpdi r4,32 104 cmpdi r4,32
105 cmpdi cr1,r4,512
99 blt .Lshort_clear 106 blt .Lshort_clear
107 bgt cr1,.Llong_clear
108
109.Lmedium_clear:
110 srdi r6,r4,5
100 mtctr r6 111 mtctr r6
101 112
102 /* Do 32 byte chunks */ 113 /* Do 32 byte chunks */
@@ -139,3 +150,53 @@ err1; stb r0,0(r3)
139 150
14010: li r3,0 15110: li r3,0
141 blr 152 blr
153
154.Llong_clear:
/*
 * Large-length path: entered from the dispatch above when r4 > 512
 * (cmpdi cr1,r4,512 / bgt cr1).  Brings the destination up to a data
 * cache line boundary with ordinary stores, zeroes whole cache lines
 * with dcbz, then hands the sub-line remainder to the short/medium paths.
 *
 * Register use in this hunk:
 *   r3  = current destination address (advanced as bytes are cleared)
 *   r4  = bytes still to clear
 *   r0  = value stored by std; presumably zero, set outside this hunk
 *         - TODO confirm
 *   r5  = address of ppc64_caches, later reused as bytes-to-line-boundary
 *   r7  = log2 of the L1 data cache line size
 *   r9  = L1 data cache line size in bytes
 *   r10 = scratch: first 4 * line size, then the (line size - 1) mask
 *   r8  = copy of r3 taken before each store loop; not read in this hunk,
 *         presumably consumed by the fault fixup code - TODO confirm
 *
 * err1/err2 are macros defined outside this hunk; presumably they attach
 * exception-table fixups to the user-space access that follows them.
 */
155 ld r5,PPC64_CACHES@toc(r2) /* r5 = &ppc64_caches, via the TOC entry declared at the top of the file */
156
157 bf cr7*4+0,11f /* skip the 8-byte store when CR7 bit 0 is clear; cr7 is set outside this hunk - presumably from the low bits of the destination alignment */
158err2; std r0,0(r3) /* one doubleword store to reach 16-byte alignment */
159 addi r3,r3,8
160 addi r4,r4,-8
161
162 /* Destination is 16 byte aligned, need to get it cacheline aligned */
16311: lwz r7,DCACHEL1LOGLINESIZE(r5) /* r7 = log2(line size) */
164 lwz r9,DCACHEL1LINESIZE(r5) /* r9 = line size in bytes */
165
166 /*
167 * With worst case alignment the long clear loop takes a minimum
168 * of 1 byte less than 2 cachelines.
169 */
/* NOTE(review): the check below compares against 4 * line size (shift left by 2), which is more conservative than the 2-line bound stated above. */
170 sldi r10,r9,2 /* r10 = 4 * line size */
171 cmpd r4,r10
172 blt .Lmedium_clear /* too little left for dcbz to pay off: use the 32-byte-chunk loop */
173
174 neg r6,r3 /* r6 = -dest */
175 addi r10,r9,-1 /* r10 = line size - 1, used as a mask (valid for a power-of-two line size) */
176 and. r5,r6,r10 /* r5 = bytes up to the next line boundary; r5 no longer holds the caches pointer */
177 beq 13f /* already line aligned */
178
179 srdi r6,r5,4 /* number of 16-byte steps; r5 is a multiple of 16 because dest is 16-byte aligned */
180 mtctr r6
181 mr r8,r3
18212:
183err1; std r0,0(r3) /* store 16 bytes per iteration until the line boundary */
184err1; std r0,8(r3)
185 addi r3,r3,16
186 bdnz 12b
187
188 sub r4,r4,r5 /* account for the alignment bytes just cleared */
189
19013: srd r6,r4,r7 /* r6 = number of whole cache lines; nonzero given the 4-line check above and r5 < line size */
191 mtctr r6
192 mr r8,r3
19314:
194err1; dcbz r0,r3 /* zero the whole cache line at r3; a 0 in the RA position means "no base register", so the address is just r3 */
195 add r3,r3,r9 /* advance one line */
196 bdnz 14b
197
198 and r4,r4,r10 /* r4 = leftover bytes, less than one line */
199
200 cmpdi r4,32 /* dispatch the tail: under 32 bytes goes to the short path, otherwise the 32-byte-chunk loop */
201 blt .Lshort_clear
202 b .Lmedium_clear