diff options
author | Anton Blanchard <anton@samba.org> | 2012-06-04 12:02:22 -0400 |
---|---|---|
committer | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2012-07-03 00:14:48 -0400 |
commit | cf8fb5533f35709ba7e31560264b565a9c7a090f (patch) | |
tree | 5964991a949f151f59dd427acffe3572c9edb2d1 /arch/powerpc/lib | |
parent | b4c3a8729ae57b4f84d661e16a192f828eca1d03 (diff) |
powerpc: Optimise the 64bit optimised __clear_user
I blame Mikey for this. He elevated my slightly dubious testcase
to benchmark status. And naturally we need to be number 1 at creating
zeros. So let's improve __clear_user some more.
As Paul suggests we can use dcbz for large lengths. This patch gets
the destination cacheline aligned then uses dcbz on whole cachelines.
Before:
10485760000 bytes (10 GB) copied, 0.414744 s, 25.3 GB/s
After:
10485760000 bytes (10 GB) copied, 0.268597 s, 39.0 GB/s
39 GB/s, a new record.
Signed-off-by: Anton Blanchard <anton@samba.org>
Tested-by: Olof Johansson <olof@lixom.net>
Acked-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r-- | arch/powerpc/lib/string_64.S | 63 |
1 files changed, 62 insertions, 1 deletions
diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S index 6613b9047005..3b1e48049faf 100644 --- a/arch/powerpc/lib/string_64.S +++ b/arch/powerpc/lib/string_64.S | |||
@@ -19,6 +19,12 @@ | |||
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <asm/ppc_asm.h> | 21 | #include <asm/ppc_asm.h> |
22 | #include <asm/asm-offsets.h> | ||
23 | |||
24 | .section ".toc","aw" | ||
25 | PPC64_CACHES: | ||
26 | .tc ppc64_caches[TC],ppc64_caches | ||
27 | .section ".text" | ||
22 | 28 | ||
23 | /** | 29 | /** |
24 | * __clear_user: - Zero a block of memory in user space, with less checking. | 30 | * __clear_user: - Zero a block of memory in user space, with less checking. |
@@ -94,9 +100,14 @@ err1; stw r0,0(r3) | |||
94 | addi r3,r3,4 | 100 | addi r3,r3,4 |
95 | 101 | ||
96 | 3: sub r4,r4,r6 | 102 | 3: sub r4,r4,r6 |
97 | srdi r6,r4,5 | 103 | |
98 | cmpdi r4,32 | 104 | cmpdi r4,32 |
105 | cmpdi cr1,r4,512 | ||
99 | blt .Lshort_clear | 106 | blt .Lshort_clear |
107 | bgt cr1,.Llong_clear | ||
108 | |||
109 | .Lmedium_clear: | ||
110 | srdi r6,r4,5 | ||
100 | mtctr r6 | 111 | mtctr r6 |
101 | 112 | ||
102 | /* Do 32 byte chunks */ | 113 | /* Do 32 byte chunks */ |
@@ -139,3 +150,53 @@ err1; stb r0,0(r3) | |||
139 | 150 | ||
140 | 10: li r3,0 | 151 | 10: li r3,0 |
141 | blr | 152 | blr |
153 | |||
154 | .Llong_clear: | ||
155 | ld r5,PPC64_CACHES@toc(r2) | ||
156 | |||
157 | bf cr7*4+0,11f | ||
158 | err2; std r0,0(r3) | ||
159 | addi r3,r3,8 | ||
160 | addi r4,r4,-8 | ||
161 | |||
162 | /* Destination is 16 byte aligned, need to get it cacheline aligned */ | ||
163 | 11: lwz r7,DCACHEL1LOGLINESIZE(r5) | ||
164 | lwz r9,DCACHEL1LINESIZE(r5) | ||
165 | |||
166 | /* | ||
167 | * With worst case alignment the long clear loop takes a minimum | ||
168 | * of 1 byte less than 2 cachelines. | ||
169 | */ | ||
170 | sldi r10,r9,2 | ||
171 | cmpd r4,r10 | ||
172 | blt .Lmedium_clear | ||
173 | |||
174 | neg r6,r3 | ||
175 | addi r10,r9,-1 | ||
176 | and. r5,r6,r10 | ||
177 | beq 13f | ||
178 | |||
179 | srdi r6,r5,4 | ||
180 | mtctr r6 | ||
181 | mr r8,r3 | ||
182 | 12: | ||
183 | err1; std r0,0(r3) | ||
184 | err1; std r0,8(r3) | ||
185 | addi r3,r3,16 | ||
186 | bdnz 12b | ||
187 | |||
188 | sub r4,r4,r5 | ||
189 | |||
190 | 13: srd r6,r4,r7 | ||
191 | mtctr r6 | ||
192 | mr r8,r3 | ||
193 | 14: | ||
194 | err1; dcbz r0,r3 | ||
195 | add r3,r3,r9 | ||
196 | bdnz 14b | ||
197 | |||
198 | and r4,r4,r10 | ||
199 | |||
200 | cmpdi r4,32 | ||
201 | blt .Lshort_clear | ||
202 | b .Lmedium_clear | ||