aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/lib
diff options
context:
space:
mode:
author: Anton Blanchard <anton@samba.org> 2015-01-20 20:27:38 -0500
committer: Michael Ellerman <mpe@ellerman.id.au> 2015-01-22 22:02:55 -0500
commit: 15c2d45d17418cc4a712608c78ff3b5f0583d83b (patch)
tree: 53e4ee00f5e0b604ee7451ee6e229751043ae0f6 /arch/powerpc/lib
parent: a113de373bcb7651196e29a49483c8e24e1e6aa9 (diff)
powerpc: Add 64bit optimised memcmp
I noticed ksm spending quite a lot of time in memcmp on a large KVM box.

The current memcmp loop is very unoptimised - byte at a time compares with no loop unrolling. We can do much much better.

Optimise the loop in a few ways:

- Unroll the byte at a time loop
- For large (at least 32 byte) comparisons that are also 8 byte aligned, use an unrolled modulo scheduled loop using 8 byte loads. This is similar to our glibc memcmp.

A simple microbenchmark testing 10000000 iterations of an 8192 byte memcmp was used to measure the performance:

baseline: 29.93 s
modified: 1.70 s

Just over 17x faster.

v2: Incorporated some suggestions from Segher:

- Use andi. instead of rldicl.
- Convert bdnzt eq, to bdnz. It's just duplicating the earlier compare and was a relic from a previous version.
- Don't use cr5, we have plans to use that CR field for fast local atomics.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r-- arch/powerpc/lib/Makefile 3
-rw-r--r-- arch/powerpc/lib/memcmp_64.S 233
-rw-r--r-- arch/powerpc/lib/string.S 2
3 files changed, 237 insertions, 1 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 1b01159b81f3..5526156aae5f 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -15,7 +15,8 @@ obj-$(CONFIG_PPC32) += div64.o copy_32.o
15 15
16obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \ 16obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \
17 usercopy_64.o mem_64.o hweight_64.o \ 17 usercopy_64.o mem_64.o hweight_64.o \
18 copyuser_power7.o string_64.o copypage_power7.o 18 copyuser_power7.o string_64.o copypage_power7.o \
19 memcmp_64.o
19ifeq ($(CONFIG_GENERIC_CSUM),) 20ifeq ($(CONFIG_GENERIC_CSUM),)
20obj-y += checksum_$(CONFIG_WORD_SIZE).o 21obj-y += checksum_$(CONFIG_WORD_SIZE).o
21obj-$(CONFIG_PPC64) += checksum_wrappers_64.o 22obj-$(CONFIG_PPC64) += checksum_wrappers_64.o
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
new file mode 100644
index 000000000000..8953d2382a65
--- /dev/null
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -0,0 +1,233 @@
1/*
2 * Author: Anton Blanchard <anton@au.ibm.com>
3 * Copyright 2015 IBM Corporation.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version
8 * 2 of the License, or (at your option) any later version.
9 */
10#include <asm/ppc_asm.h>
11
12#define off8 r6 /* off8/off16/off24: index registers that .Llong loads with 8, 16 and 24 for the indexed LD forms */
13#define off16 r7
14#define off24 r8
15
16#define rA r9 /* rA..rH: data registers; .Llong loads them as pairs (rA,rB) (rC,rD) (rE,rF) (rG,rH), first of each pair from r3, second from r4 */
17#define rB r10
18#define rC r11
19#define rD r27 /* r27-r31 are saved below r1 by .Llong before use and reloaded before it returns */
20#define rE r28
21#define rF r29
22#define rG r30
23#define rH r31
24
25#ifdef __LITTLE_ENDIAN__
26#define LD ldbrx /* byte-reversed doubleword load on little-endian; presumably so that cmpld orders doublewords by their lowest-addressed byte */
27#else
28#define LD ldx /* plain indexed doubleword load otherwise */
29#endif
30
31_GLOBAL(memcmp) /* In: r3, r4 = buffers to compare, r5 = length in bytes. Out: r3 = 0 when equal, else nonzero with the sign of (data from the r3 buffer) - (data from the r4 buffer) at the first difference */
32 cmpdi cr1,r5,0 /* cr1 records whether the length is zero */
33
34 /* Use the short loop unless both pointers are 8B aligned */
35 or r6,r3,r4
36 andi. r6,r6,7 /* cr0.eq is set only if the low 3 bits of both pointers are zero */
37
38 /* Use the short loop if length is less than 32B */
39 cmpdi cr6,r5,31
40
41 beq cr1,.Lzero /* zero length: return 0 without touching memory */
42 bne .Lshort /* cr0 from the andi. above: at least one pointer is not 8B aligned */
43 bgt cr6,.Llong /* aligned and length > 31; otherwise fall through to .Lshort */
44
45.Lshort: /* byte-at-a-time compare, unrolled 4x; CTR holds the number of bytes still to compare */
46 mtctr r5
47
481: lbz rA,0(r3)
49 lbz rB,0(r4)
50 subf. rC,rB,rA /* rC = rA - rB; the record form sets cr0 for the bne below */
51 bne .Lnon_zero
52 bdz .Lzero /* decrement CTR; if that was the last byte, everything matched */
53
54 lbz rA,1(r3)
55 lbz rB,1(r4)
56 subf. rC,rB,rA
57 bne .Lnon_zero
58 bdz .Lzero
59
60 lbz rA,2(r3)
61 lbz rB,2(r4)
62 subf. rC,rB,rA
63 bne .Lnon_zero
64 bdz .Lzero
65
66 lbz rA,3(r3)
67 lbz rB,3(r4)
68 subf. rC,rB,rA
69 bne .Lnon_zero
70
71 addi r3,r3,4 /* step both pointers past the 4 bytes just compared */
72 addi r4,r4,4
73
74 bdnz 1b /* CTR decrement for the 4th byte; loop while bytes remain, else fall into .Lzero */
75
76.Lzero: /* buffers compared equal (or length was 0) */
77 li r3,0
78 blr
79
80.Lnon_zero: /* short loop found a difference: return the byte difference left in rC */
81 mr r3,rC
82 blr
83
84.Llong: /* both pointers 8B aligned and length >= 32: compare 32 bytes per step with doubleword loads */
85 li off8,8
86 li off16,16
87 li off24,24
88
89 std r31,-8(r1) /* save r27-r31 (used as rD..rH) at negative offsets from r1; r1 itself is not adjusted. NOTE(review): relies on that area staying intact while this runs */
90 std r30,-16(r1)
91 std r29,-24(r1)
92 std r28,-32(r1)
93 std r27,-40(r1)
94
95 srdi r0,r5,5 /* r0 = length / 32 = number of full 32-byte blocks */
96 mtctr r0
97 andi. r5,r5,31 /* r5 = length % 32: bytes left for .Lshort once the blocks are done */
98
99 LD rA,0,r3 /* load the first 32-byte block from each buffer */
100 LD rB,0,r4
101
102 LD rC,off8,r3
103 LD rD,off8,r4
104
105 LD rE,off16,r3
106 LD rF,off16,r4
107
108 LD rG,off24,r3
109 LD rH,off24,r4
110 cmpld cr0,rA,rB /* unsigned compare; the branch that tests cr0 comes later */
111
112 addi r3,r3,32
113 addi r4,r4,32
114
115 bdz .Lfirst32 /* only one full block: finish its compares without loading more */
116
117 LD rA,0,r3 /* begin loading the second block while compares of the first are still being issued */
118 LD rB,0,r4
119 cmpld cr1,rC,rD
120
121 LD rC,off8,r3
122 LD rD,off8,r4
123 cmpld cr6,rE,rF
124
125 LD rE,off16,r3
126 LD rF,off16,r4
127 cmpld cr7,rG,rH
128 bne cr0,.LcmpAB /* cr0 still holds the rA/rB compare from the first block */
129
130 LD rG,off24,r3
131 LD rH,off24,r4
132 cmpld cr0,rA,rB
133 bne cr1,.LcmpCD
134
135 addi r3,r3,32
136 addi r4,r4,32
137
138 bdz .Lsecond32 /* exactly two full blocks: go drain the outstanding compares */
139
140 .balign 16
141
1421: LD rA,0,r3 /* main loop: each pass loads the next block and tests compares of data loaded earlier */
143 LD rB,0,r4
144 cmpld cr1,rC,rD
145 bne cr6,.LcmpEF /* cr6/cr7 tested here were set before this pass started */
146
147 LD rC,off8,r3
148 LD rD,off8,r4
149 cmpld cr6,rE,rF
150 bne cr7,.LcmpGH
151
152 LD rE,off16,r3
153 LD rF,off16,r4
154 cmpld cr7,rG,rH
155 bne cr0,.LcmpAB
156
157 LD rG,off24,r3
158 LD rH,off24,r4
159 cmpld cr0,rA,rB
160 bne cr1,.LcmpCD
161
162 addi r3,r3,32
163 addi r4,r4,32
164
165 bdnz 1b
166
167.Lsecond32: /* no more loads: issue and test the compares still outstanding for the last loaded block */
168 cmpld cr1,rC,rD
169 bne cr6,.LcmpEF
170
171 cmpld cr6,rE,rF
172 bne cr7,.LcmpGH
173
174 cmpld cr7,rG,rH
175 bne cr0,.LcmpAB
176
177 bne cr1,.LcmpCD
178 bne cr6,.LcmpEF
179 bne cr7,.LcmpGH
180
181.Ltail: /* every full block matched: restore r27-r31, then handle the r5 leftover bytes */
182 ld r31,-8(r1)
183 ld r30,-16(r1)
184 ld r29,-24(r1)
185 ld r28,-32(r1)
186 ld r27,-40(r1)
187
188 cmpdi r5,0
189 beq .Lzero /* length was a multiple of 32: equal */
190 b .Lshort /* r3/r4 already point past the blocks; r5 = remaining byte count */
191
192.Lfirst32: /* single-block case: compare the three pairs not yet compared, then test all four */
193 cmpld cr1,rC,rD
194 cmpld cr6,rE,rF
195 cmpld cr7,rG,rH
196
197 bne cr0,.LcmpAB
198 bne cr1,.LcmpCD
199 bne cr6,.LcmpEF
200 bne cr7,.LcmpGH
201
202 b .Ltail
203
204.LcmpAB: /* cr0 holds a not-equal rA/rB compare: return 1 if greater (unsigned), else -1 */
205 li r3,1
206 bgt cr0,.Lout
207 li r3,-1
208 b .Lout
209
210.LcmpCD: /* same as .LcmpAB, using the rC/rD result in cr1 */
211 li r3,1
212 bgt cr1,.Lout
213 li r3,-1
214 b .Lout
215
216.LcmpEF: /* same as .LcmpAB, using the rE/rF result in cr6 */
217 li r3,1
218 bgt cr6,.Lout
219 li r3,-1
220 b .Lout
221
222.LcmpGH: /* same as .LcmpAB, using the rG/rH result in cr7; falls through to .Lout */
223 li r3,1
224 bgt cr7,.Lout
225 li r3,-1
226
227.Lout: /* long-path exit with the result already in r3: restore r27-r31 and return */
228 ld r31,-8(r1)
229 ld r30,-16(r1)
230 ld r29,-24(r1)
231 ld r28,-32(r1)
232 ld r27,-40(r1)
233 blr
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 1b5a0a09d609..c80fb49ce607 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -93,6 +93,7 @@ _GLOBAL(strlen)
93 subf r3,r3,r4 93 subf r3,r3,r4
94 blr 94 blr
95 95
96#ifdef CONFIG_PPC32
96_GLOBAL(memcmp) 97_GLOBAL(memcmp)
97 PPC_LCMPI 0,r5,0 98 PPC_LCMPI 0,r5,0
98 beq- 2f 99 beq- 2f
@@ -106,6 +107,7 @@ _GLOBAL(memcmp)
106 blr 107 blr
1072: li r3,0 1082: li r3,0
108 blr 109 blr
110#endif
109 111
110_GLOBAL(memchr) 112_GLOBAL(memchr)
111 PPC_LCMPI 0,r5,0 113 PPC_LCMPI 0,r5,0