diff options
author | Anton Blanchard <anton@samba.org> | 2015-01-20 20:27:38 -0500 |
---|---|---|
committer | Michael Ellerman <mpe@ellerman.id.au> | 2015-01-22 22:02:55 -0500 |
commit | 15c2d45d17418cc4a712608c78ff3b5f0583d83b (patch) | |
tree | 53e4ee00f5e0b604ee7451ee6e229751043ae0f6 /arch/powerpc/lib | |
parent | a113de373bcb7651196e29a49483c8e24e1e6aa9 (diff) |
powerpc: Add 64bit optimised memcmp
I noticed ksm spending quite a lot of time in memcmp on a large
KVM box. The current memcmp loop is very unoptimised - byte at a
time compares with no loop unrolling. We can do much much better.
Optimise the loop in a few ways:
- Unroll the byte at a time loop
- For large (at least 32 byte) comparisons that are also 8 byte
aligned, use an unrolled modulo scheduled loop using 8 byte
loads. This is similar to our glibc memcmp.
A simple microbenchmark testing 10000000 iterations of an 8192 byte
memcmp was used to measure the performance:
baseline: 29.93 s
modified: 1.70 s
Just over 17x faster.
v2: Incorporated some suggestions from Segher:
- Use andi. instead of rldicl.
- Convert bdnzt eq, to bdnz. It's just duplicating the earlier compare
and was a relic from a previous version.
- Don't use cr5, we have plans to use that CR field for fast local
atomics.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r-- | arch/powerpc/lib/Makefile | 3 | ||||
-rw-r--r-- | arch/powerpc/lib/memcmp_64.S | 233 | ||||
-rw-r--r-- | arch/powerpc/lib/string.S | 2 |
3 files changed, 237 insertions, 1 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 1b01159b81f3..5526156aae5f 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile | |||
@@ -15,7 +15,8 @@ obj-$(CONFIG_PPC32) += div64.o copy_32.o | |||
15 | 15 | ||
16 | obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \ | 16 | obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \ |
17 | usercopy_64.o mem_64.o hweight_64.o \ | 17 | usercopy_64.o mem_64.o hweight_64.o \ |
18 | copyuser_power7.o string_64.o copypage_power7.o | 18 | copyuser_power7.o string_64.o copypage_power7.o \ |
19 | memcmp_64.o | ||
19 | ifeq ($(CONFIG_GENERIC_CSUM),) | 20 | ifeq ($(CONFIG_GENERIC_CSUM),) |
20 | obj-y += checksum_$(CONFIG_WORD_SIZE).o | 21 | obj-y += checksum_$(CONFIG_WORD_SIZE).o |
21 | obj-$(CONFIG_PPC64) += checksum_wrappers_64.o | 22 | obj-$(CONFIG_PPC64) += checksum_wrappers_64.o |
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S new file mode 100644 index 000000000000..8953d2382a65 --- /dev/null +++ b/arch/powerpc/lib/memcmp_64.S | |||
@@ -0,0 +1,233 @@ | |||
1 | /* | ||
2 | * Author: Anton Blanchard <anton@au.ibm.com> | ||
3 | * Copyright 2015 IBM Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License | ||
7 | * as published by the Free Software Foundation; either version | ||
8 | * 2 of the License, or (at your option) any later version. | ||
9 | */ | ||
10 | #include <asm/ppc_asm.h> | ||
11 | |||
12 | #define off8 r6 /* off8/off16/off24: index registers, set to 8/16/24 in .Llong */ | ||
13 | #define off16 r7 | ||
14 | #define off24 r8 | ||
15 | |||
16 | #define rA r9 /* rA-rH: data registers holding the bytes/doublewords being compared */ | ||
17 | #define rB r10 | ||
18 | #define rC r11 | ||
19 | #define rD r27 /* rD-rH live in r27-r31, which .Llong saves before use and restores on exit */ | ||
20 | #define rE r28 | ||
21 | #define rF r29 | ||
22 | #define rG r30 | ||
23 | #define rH r31 | ||
24 | |||
25 | #ifdef __LITTLE_ENDIAN__ /* LD: the 8-byte indexed load used by the .Llong path */ | ||
26 | #define LD ldbrx /* byte-reversed load; presumably so that cmpld orders doublewords like a byte-wise compare - confirm */ | ||
27 | #else | ||
28 | #define LD ldx | ||
29 | #endif | ||
30 | |||
31 | _GLOBAL(memcmp) /* In: r3, r4 = buffers, r5 = length in bytes. Out: r3 = 0 if equal; short path returns (byte from r3 buffer) - (byte from r4 buffer), long path returns 1 or -1 */ | ||
32 | cmpdi cr1,r5,0 /* cr1: is the length zero? */ | ||
33 | |||
34 | /* Use the short loop if either pointer is not 8B aligned */ | ||
35 | or r6,r3,r4 /* merge the low bits of both pointers */ | ||
36 | andi. r6,r6,7 /* cr0: ne if either pointer has any of its low 3 bits set */ | ||
37 | |||
38 | /* Use the short loop if length is less than 32B */ | ||
39 | cmpdi cr6,r5,31 /* cr6: gt when length > 31 */ | ||
40 | |||
41 | beq cr1,.Lzero /* zero length: nothing to compare */ | ||
42 | bne .Lshort /* cr0 from the andi. above: not both 8B aligned */ | ||
43 | bgt cr6,.Llong /* aligned and > 31 bytes; otherwise fall through to .Lshort */ | ||
44 | |||
45 | .Lshort: /* byte-at-a-time loop, unrolled 4x; also entered from .Ltail for the bytes left after .Llong */ | ||
46 | mtctr r5 /* CTR = number of bytes left to compare */ | ||
47 | |||
48 | 1: lbz rA,0(r3) | ||
49 | lbz rB,0(r4) | ||
50 | subf. rC,rB,rA /* rC = rA - rB, and record the result in cr0 */ | ||
51 | bne .Lnon_zero /* bytes differ: return rC */ | ||
52 | bdz .Lzero /* decrement CTR; if it reached zero every byte matched */ | ||
53 | |||
54 | lbz rA,1(r3) | ||
55 | lbz rB,1(r4) | ||
56 | subf. rC,rB,rA | ||
57 | bne .Lnon_zero | ||
58 | bdz .Lzero | ||
59 | |||
60 | lbz rA,2(r3) | ||
61 | lbz rB,2(r4) | ||
62 | subf. rC,rB,rA | ||
63 | bne .Lnon_zero | ||
64 | bdz .Lzero | ||
65 | |||
66 | lbz rA,3(r3) | ||
67 | lbz rB,3(r4) | ||
68 | subf. rC,rB,rA | ||
69 | bne .Lnon_zero | ||
70 | |||
71 | addi r3,r3,4 /* step both pointers past the four bytes just compared */ | ||
72 | addi r4,r4,4 | ||
73 | |||
74 | bdnz 1b /* decrement CTR; loop while non-zero, else fall through to .Lzero */ | ||
75 | |||
76 | .Lzero: /* buffers compared equal */ | ||
77 | li r3,0 | ||
78 | blr | ||
79 | |||
80 | .Lnon_zero: /* return the byte difference computed in the short loop */ | ||
81 | mr r3,rC | ||
82 | blr | ||
83 | |||
84 | .Llong: /* both pointers 8B aligned and length > 31: compare 32 bytes per block using 8-byte loads */ | ||
85 | li off8,8 /* constant offsets for the indexed LD form */ | ||
86 | li off16,16 | ||
87 | li off24,24 | ||
88 | |||
89 | std r31,-8(r1) /* save r27-r31 (used as rD-rH) at negative offsets from r1; r1 itself is not adjusted */ | ||
90 | std r30,-16(r1) | ||
91 | std r29,-24(r1) | ||
92 | std r28,-32(r1) | ||
93 | std r27,-40(r1) | ||
94 | |||
95 | srdi r0,r5,5 /* r0 = length / 32 */ | ||
96 | mtctr r0 /* CTR = number of whole 32-byte blocks */ | ||
97 | andi. r5,r5,31 /* r5 = length % 32, the bytes left for .Lshort (re-tested in .Ltail) */ | ||
98 | |||
99 | LD rA,0,r3 /* load the first 32-byte block: A/C/E/G from the r3 buffer, B/D/F/H from the r4 buffer */ | ||
100 | LD rB,0,r4 | ||
101 | |||
102 | LD rC,off8,r3 | ||
103 | LD rD,off8,r4 | ||
104 | |||
105 | LD rE,off16,r3 | ||
106 | LD rF,off16,r4 | ||
107 | |||
108 | LD rG,off24,r3 | ||
109 | LD rH,off24,r4 | ||
110 | cmpld cr0,rA,rB /* unsigned compare of the first doubleword pair; result is tested later */ | ||
111 | |||
112 | addi r3,r3,32 | ||
113 | addi r4,r4,32 | ||
114 | |||
115 | bdz .Lfirst32 /* only one block: finish its remaining compares in .Lfirst32 */ | ||
116 | |||
117 | LD rA,0,r3 /* from here on, loads of the next block are interleaved with compares of the block already in registers */ | ||
118 | LD rB,0,r4 | ||
119 | cmpld cr1,rC,rD | ||
120 | |||
121 | LD rC,off8,r3 | ||
122 | LD rD,off8,r4 | ||
123 | cmpld cr6,rE,rF | ||
124 | |||
125 | LD rE,off16,r3 | ||
126 | LD rF,off16,r4 | ||
127 | cmpld cr7,rG,rH | ||
128 | bne cr0,.LcmpAB /* tests the A/B compare of the first block */ | ||
129 | |||
130 | LD rG,off24,r3 | ||
131 | LD rH,off24,r4 | ||
132 | cmpld cr0,rA,rB /* A/B of the second block */ | ||
133 | bne cr1,.LcmpCD /* tests the C/D compare of the first block */ | ||
134 | |||
135 | addi r3,r3,32 | ||
136 | addi r4,r4,32 | ||
137 | |||
138 | bdz .Lsecond32 /* no further blocks to load: resolve the pending compares in .Lsecond32 */ | ||
139 | |||
140 | .balign 16 /* align the start of the main loop */ | ||
141 | |||
142 | 1: LD rA,0,r3 /* main loop: each pass loads one new block while testing compare results still pending from earlier loads */ | ||
143 | LD rB,0,r4 | ||
144 | cmpld cr1,rC,rD | ||
145 | bne cr6,.LcmpEF /* cr6 was set before rE/rF were last reloaded */ | ||
146 | |||
147 | LD rC,off8,r3 | ||
148 | LD rD,off8,r4 | ||
149 | cmpld cr6,rE,rF | ||
150 | bne cr7,.LcmpGH /* cr7 was set before rG/rH were last reloaded */ | ||
151 | |||
152 | LD rE,off16,r3 | ||
153 | LD rF,off16,r4 | ||
154 | cmpld cr7,rG,rH | ||
155 | bne cr0,.LcmpAB /* cr0 was set before rA/rB were reloaded at the top of this pass */ | ||
156 | |||
157 | LD rG,off24,r3 | ||
158 | LD rH,off24,r4 | ||
159 | cmpld cr0,rA,rB | ||
160 | bne cr1,.LcmpCD | ||
161 | |||
162 | addi r3,r3,32 | ||
163 | addi r4,r4,32 | ||
164 | |||
165 | bdnz 1b /* decrement CTR; loop while blocks remain to be loaded */ | ||
166 | |||
167 | .Lsecond32: /* every block is loaded: finish the outstanding compares and test all results */ | ||
168 | cmpld cr1,rC,rD | ||
169 | bne cr6,.LcmpEF | ||
170 | |||
171 | cmpld cr6,rE,rF | ||
172 | bne cr7,.LcmpGH | ||
173 | |||
174 | cmpld cr7,rG,rH | ||
175 | bne cr0,.LcmpAB | ||
176 | |||
177 | bne cr1,.LcmpCD | ||
178 | bne cr6,.LcmpEF | ||
179 | bne cr7,.LcmpGH | ||
180 | |||
181 | .Ltail: /* all whole 32-byte blocks matched */ | ||
182 | ld r31,-8(r1) /* restore r27-r31 before leaving the long path */ | ||
183 | ld r30,-16(r1) | ||
184 | ld r29,-24(r1) | ||
185 | ld r28,-32(r1) | ||
186 | ld r27,-40(r1) | ||
187 | |||
188 | cmpdi r5,0 /* any bytes left over (length % 32)? */ | ||
189 | beq .Lzero | ||
190 | b .Lshort /* compare the remaining r5 bytes one at a time; r3/r4 already point past the blocks */ | ||
191 | |||
192 | .Lfirst32: /* exactly one 32-byte block: cr0 already holds the A/B result */ | ||
193 | cmpld cr1,rC,rD | ||
194 | cmpld cr6,rE,rF | ||
195 | cmpld cr7,rG,rH | ||
196 | |||
197 | bne cr0,.LcmpAB | ||
198 | bne cr1,.LcmpCD | ||
199 | bne cr6,.LcmpEF | ||
200 | bne cr7,.LcmpGH | ||
201 | |||
202 | b .Ltail | ||
203 | |||
204 | .LcmpAB: /* A/B pair differs: return 1 if cr0 says greater-than (unsigned, from cmpld), else -1 */ | ||
205 | li r3,1 | ||
206 | bgt cr0,.Lout | ||
207 | li r3,-1 | ||
208 | b .Lout | ||
209 | |||
210 | .LcmpCD: /* same as .LcmpAB, using the C/D result in cr1 */ | ||
211 | li r3,1 | ||
212 | bgt cr1,.Lout | ||
213 | li r3,-1 | ||
214 | b .Lout | ||
215 | |||
216 | .LcmpEF: /* same as .LcmpAB, using the E/F result in cr6 */ | ||
217 | li r3,1 | ||
218 | bgt cr6,.Lout | ||
219 | li r3,-1 | ||
220 | b .Lout | ||
221 | |||
222 | .LcmpGH: /* same as .LcmpAB, using the G/H result in cr7; falls through to .Lout */ | ||
223 | li r3,1 | ||
224 | bgt cr7,.Lout | ||
225 | li r3,-1 | ||
226 | |||
227 | .Lout: /* restore r27-r31 and return with the result already in r3 */ | ||
228 | ld r31,-8(r1) | ||
229 | ld r30,-16(r1) | ||
230 | ld r29,-24(r1) | ||
231 | ld r28,-32(r1) | ||
232 | ld r27,-40(r1) | ||
233 | blr | ||
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S index 1b5a0a09d609..c80fb49ce607 100644 --- a/arch/powerpc/lib/string.S +++ b/arch/powerpc/lib/string.S | |||
@@ -93,6 +93,7 @@ _GLOBAL(strlen) | |||
93 | subf r3,r3,r4 | 93 | subf r3,r3,r4 |
94 | blr | 94 | blr |
95 | 95 | ||
96 | #ifdef CONFIG_PPC32 | ||
96 | _GLOBAL(memcmp) | 97 | _GLOBAL(memcmp) |
97 | PPC_LCMPI 0,r5,0 | 98 | PPC_LCMPI 0,r5,0 |
98 | beq- 2f | 99 | beq- 2f |
@@ -106,6 +107,7 @@ _GLOBAL(memcmp) | |||
106 | blr | 107 | blr |
107 | 2: li r3,0 | 108 | 2: li r3,0 |
108 | blr | 109 | blr |
110 | #endif | ||
109 | 111 | ||
110 | _GLOBAL(memchr) | 112 | _GLOBAL(memchr) |
111 | PPC_LCMPI 0,r5,0 | 113 | PPC_LCMPI 0,r5,0 |