diff options
author | Richard Kuo <rkuo@codeaurora.org> | 2011-10-31 19:38:38 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-11-01 10:34:18 -0400 |
commit | c150290df4f97d202d0913ff9cb0898032a803d7 (patch) | |
tree | 8cc890ea53af56abd61a82cafa272185fcd9aa54 /arch/hexagon/lib/memset.S | |
parent | 075a46a049d4ec16925139d69b4473499fd14122 (diff) |
Hexagon: Add memcpy and memset accelerated functions
Signed-off-by: Richard Kuo <rkuo@codeaurora.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch/hexagon/lib/memset.S')
-rw-r--r-- | arch/hexagon/lib/memset.S | 315 |
1 files changed, 315 insertions, 0 deletions
diff --git a/arch/hexagon/lib/memset.S b/arch/hexagon/lib/memset.S new file mode 100644 index 000000000000..26d961439ab0 --- /dev/null +++ b/arch/hexagon/lib/memset.S | |||
@@ -0,0 +1,315 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2011 Code Aurora Forum. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 and | ||
6 | * only version 2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program; if not, write to the Free Software | ||
15 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA | ||
16 | * 02110-1301, USA. | ||
17 | */ | ||
18 | |||
19 | |||
20 | /* HEXAGON assembly optimized memset */ | ||
21 | /* Replaces the standard library function memset */ | ||
22 | |||
23 | |||
24 | .macro HEXAGON_OPT_FUNC_BEGIN name /* open a function: emits the section, alignment, symbol directives and entry label for "name" */ | ||
25 | .text | ||
26 | .p2align 4 /* align the entry label to 2^4 = 16 bytes */ | ||
27 | .globl \name | ||
28 | .type \name, @function | ||
29 | \name: | ||
30 | .endm | ||
31 | |||
32 | .macro HEXAGON_OPT_FUNC_FINISH name /* close a function opened with HEXAGON_OPT_FUNC_BEGIN */ | ||
33 | .size \name, . - \name /* symbol size = distance from the entry label to this point */ | ||
34 | .endm | ||
35 | |||
36 | /* FUNCTION: memset (v2 version). In: r0 = dest, r1 = fill value, r2 = byte count. r0 is never written, so it still holds dest at every return. Working registers: r8 = store pointer, r4 (later r5:4) = replicated fill, r3:2 = remaining count, r7:6 = decrement. */ | ||
37 | #if __HEXAGON_ARCH__ < 3 | ||
38 | HEXAGON_OPT_FUNC_BEGIN memset | ||
39 | { | ||
40 | r6 = #8 | ||
41 | r7 = extractu(r0, #3 , #0) /* r7 = low 3 bits of dest */ | ||
42 | p0 = cmp.eq(r2, #0) | ||
43 | p1 = cmp.gtu(r2, #7) /* p1 = count > 7; consumed two packets further down */ | ||
44 | } | ||
45 | { | ||
46 | r4 = vsplatb(r1) /* r4 = fill byte replicated into every byte of the word */ | ||
47 | r8 = r0 /* leave r0 intact for return val */ | ||
48 | r9 = sub(r6, r7) /* bytes until double alignment */ | ||
49 | if p0 jumpr r31 /* count == 0, so return */ | ||
50 | } | ||
51 | { | ||
52 | r3 = #0 /* high word of the 64-bit count r3:2 */ | ||
53 | r7 = #0 /* high word of the 64-bit decrement r7:6 */ | ||
54 | p0 = tstbit(r9, #0) /* bit 0 of the alignment distance; tested at 2: to decide on the leading byte store */ | ||
55 | if p1 jump 2f /* skip byte loop */ | ||
56 | } | ||
57 | |||
58 | /* less than 8 bytes to set, so just set a byte at a time and return */ | ||
59 | |||
60 | loop0(1f, r2) /* byte loop */ | ||
61 | .falign | ||
62 | 1: /* byte loop */ | ||
63 | { | ||
64 | memb(r8++#1) = r4 | ||
65 | }:endloop0 | ||
66 | jumpr r31 | ||
67 | .falign | ||
68 | 2: /* skip byte loop */ | ||
69 | { | ||
70 | r6 = #1 /* decrement for the byte store below */ | ||
71 | p0 = tstbit(r9, #1) /* prepared for the branch at 3: */ | ||
72 | p1 = cmp.eq(r2, #1) | ||
73 | if !p0 jump 3f /* skip initial byte store; no .new here, so this tests the p0 produced by the previous packet (bit 0 of r9) */ | ||
74 | } | ||
75 | { | ||
76 | memb(r8++#1) = r4 | ||
77 | r3:2 = sub(r3:2, r7:6) /* count -= 1 */ | ||
78 | if p1 jumpr r31 | ||
79 | } | ||
80 | .falign | ||
81 | 3: /* skip initial byte store */ | ||
82 | { | ||
83 | r6 = #2 /* decrement for the halfword store below */ | ||
84 | p0 = tstbit(r9, #2) /* prepared for the branch at 4: */ | ||
85 | p1 = cmp.eq(r2, #2) | ||
86 | if !p0 jump 4f /* skip initial half store; tests bit 1 of r9 from the packet that led here */ | ||
87 | } | ||
88 | { | ||
89 | memh(r8++#2) = r4 | ||
90 | r3:2 = sub(r3:2, r7:6) /* count -= 2 */ | ||
91 | if p1 jumpr r31 | ||
92 | } | ||
93 | .falign | ||
94 | 4: /* skip initial half store */ | ||
95 | { | ||
96 | r6 = #4 /* decrement for the word store below */ | ||
97 | p0 = cmp.gtu(r2, #7) /* if the word store is skipped, this is the double-loop test used at 5: */ | ||
98 | p1 = cmp.eq(r2, #4) | ||
99 | if !p0 jump 5f /* skip initial word store; tests bit 2 of r9 from the packet that led here */ | ||
100 | } | ||
101 | { | ||
102 | memw(r8++#4) = r4 | ||
103 | r3:2 = sub(r3:2, r7:6) /* count -= 4 */ | ||
104 | p0 = cmp.gtu(r2, #11) /* NOTE(review): presumably reads r2 from before the subtract in this packet, i.e. (count after the word store) > 7 -- confirm against packet semantics */ | ||
105 | if p1 jumpr r31 | ||
106 | } | ||
107 | .falign | ||
108 | 5: /* skip initial word store */ | ||
109 | { | ||
110 | r10 = lsr(r2, #3) /* r10 = count / 8 = trip count for the double loop */ | ||
111 | p1 = cmp.eq(r3, #1) /* r3 was set to 0 at entry, so this is expected to leave p1 false when the double loop is skipped */ | ||
112 | if !p0 jump 7f /* skip double loop */ | ||
113 | } | ||
114 | { | ||
115 | r5 = r4 /* r5:4 = eight copies of the fill byte */ | ||
116 | r6 = #8 /* decrement for each doubleword store */ | ||
117 | loop0(6f, r10) /* double loop */ | ||
118 | } | ||
119 | |||
120 | /* set bytes a double word at a time */ | ||
121 | |||
122 | .falign | ||
123 | 6: /* double loop */ | ||
124 | { | ||
125 | memd(r8++#8) = r5:4 | ||
126 | r3:2 = sub(r3:2, r7:6) /* count -= 8 */ | ||
127 | p1 = cmp.eq(r2, #8) /* NOTE(review): presumably compares the pre-subtract count, so p1 ends up set when the last iteration consumed the final 8 bytes -- confirm */ | ||
128 | }:endloop0 | ||
129 | .falign | ||
130 | 7: /* skip double loop */ | ||
131 | { | ||
132 | p0 = tstbit(r2, #2) /* bit 2 of the remaining count: is a trailing word store needed? */ | ||
133 | if p1 jumpr r31 /* p1 comes from the double loop (or from the compare at 5: when the loop was skipped) */ | ||
134 | } | ||
135 | { | ||
136 | r6 = #4 /* decrement for the trailing word store */ | ||
137 | p0 = tstbit(r2, #1) /* bit 1 of the remaining count, prepared for the branch at 8: */ | ||
138 | p1 = cmp.eq(r2, #4) | ||
139 | if !p0 jump 8f /* skip final word store */ | ||
140 | } | ||
141 | { | ||
142 | memw(r8++#4) = r4 | ||
143 | r3:2 = sub(r3:2, r7:6) /* count -= 4; bit 1 of the count is unchanged by this */ | ||
144 | if p1 jumpr r31 | ||
145 | } | ||
146 | .falign | ||
147 | 8: /* skip final word store */ | ||
148 | { | ||
149 | p1 = cmp.eq(r2, #2) | ||
150 | if !p0 jump 9f /* skip final half store */ | ||
151 | } | ||
152 | { | ||
153 | memh(r8++#2) = r4 /* the count is not decremented here; p1 alone decides whether a byte remains */ | ||
154 | if p1 jumpr r31 | ||
155 | } | ||
156 | .falign | ||
157 | 9: /* skip final half store */ | ||
158 | { | ||
159 | memb(r8++#1) = r4 /* last byte; stored unconditionally once control reaches 9: */ | ||
160 | jumpr r31 | ||
161 | } | ||
162 | HEXAGON_OPT_FUNC_FINISH memset | ||
163 | #endif | ||
164 | |||
165 | |||
166 | /* FUNCTION: memset (v3 and higher version). In: r0 = dest, r1 = fill value, r2 = byte count. r0 is never written, so it still holds dest at every return. Working registers: r6 = store pointer, r7 = replicated fill word, r5:4 = replicated fill doubleword, r2 = remaining count. */ | ||
167 | #if __HEXAGON_ARCH__ >= 3 | ||
168 | HEXAGON_OPT_FUNC_BEGIN memset | ||
169 | { | ||
170 | r7=vsplatb(r1) /* r7 = fill byte replicated into every byte of the word */ | ||
171 | r6 = r0 /* r6 walks the buffer; r0 is kept for the return value */ | ||
172 | if (r2==#0) jump:nt .L1 /* count == 0: return immediately */ | ||
173 | } | ||
174 | { | ||
175 | r5:4=combine(r7,r7) /* r5:4 = eight copies of the fill byte */ | ||
176 | p0 = cmp.gtu(r2,#8) | ||
177 | if (p0.new) jump:nt .L3 /* more than 8 bytes: take the aligned path */ | ||
178 | } | ||
179 | { | ||
180 | r3 = r0 | ||
181 | loop0(.L47,r2) /* 8 bytes or fewer: one byte store per iteration */ | ||
182 | } | ||
183 | .falign | ||
184 | .L47: | ||
185 | { | ||
186 | memb(r3++#1) = r1 | ||
187 | }:endloop0 /* start=.L47 */ | ||
188 | jumpr r31 | ||
189 | .L3: | ||
190 | { | ||
191 | p0 = tstbit(r0,#0) | ||
192 | if (!p0.new) jump:nt .L8 /* dest is even: no leading byte store */ | ||
193 | p1 = cmp.eq(r2, #1) | ||
194 | } | ||
195 | { | ||
196 | r6 = add(r0, #1) | ||
197 | r2 = add(r2,#-1) | ||
198 | memb(r0) = r1 /* leading byte; r6 then points at the next, even address */ | ||
199 | if (p1) jump .L1 | ||
200 | } | ||
201 | .L8: | ||
202 | { | ||
203 | p0 = tstbit(r6,#1) | ||
204 | if (!p0.new) jump:nt .L10 /* bit 1 of the pointer clear: no leading halfword store */ | ||
205 | } | ||
206 | { | ||
207 | r2 = add(r2,#-2) | ||
208 | memh(r6++#2) = r7 | ||
209 | p0 = cmp.eq(r2, #2) /* NOTE(review): presumably compares the count from before the add in this packet -- confirm */ | ||
210 | if (p0.new) jump:nt .L1 | ||
211 | } | ||
212 | .L10: | ||
213 | { | ||
214 | p0 = tstbit(r6,#2) | ||
215 | if (!p0.new) jump:nt .L12 /* bit 2 of the pointer clear: no leading word store */ | ||
216 | } | ||
217 | { | ||
218 | r2 = add(r2,#-4) | ||
219 | memw(r6++#4) = r7 | ||
220 | p0 = cmp.eq(r2, #4) /* NOTE(review): presumably compares the count from before the add in this packet -- confirm */ | ||
221 | if (p0.new) jump:nt .L1 | ||
222 | } | ||
223 | .L12: | ||
224 | { | ||
225 | p0 = cmp.gtu(r2,#127) | ||
226 | if (!p0.new) jump:nt .L14 /* 127 bytes or fewer left: skip the 32-byte block path */ | ||
227 | } | ||
228 | r3 = and(r6,#31) /* offset of the pointer within a 32-byte block */ | ||
229 | if (r3==#0) jump:nt .L17 | ||
230 | { | ||
231 | memd(r6++#8) = r5:4 /* first of up to three doubleword stores that bring r6 to a 32-byte boundary */ | ||
232 | r2 = add(r2,#-8) | ||
233 | } | ||
234 | r3 = and(r6,#31) | ||
235 | if (r3==#0) jump:nt .L17 | ||
236 | { | ||
237 | memd(r6++#8) = r5:4 | ||
238 | r2 = add(r2,#-8) | ||
239 | } | ||
240 | r3 = and(r6,#31) | ||
241 | if (r3==#0) jump:nt .L17 | ||
242 | { | ||
243 | memd(r6++#8) = r5:4 | ||
244 | r2 = add(r2,#-8) | ||
245 | } | ||
246 | .L17: | ||
247 | { | ||
248 | r3 = lsr(r2,#5) /* r3 = count / 32 = number of 32-byte blocks */ | ||
249 | if (r1!=#0) jump:nt .L18 /* non-zero fill register: use the store loop at .L18; note this tests all of r1, not only its low byte */ | ||
250 | } | ||
251 | { | ||
252 | r8 = r3 | ||
253 | r3 = r6 | ||
254 | loop0(.L46,r3) /* NOTE(review): r3 is also overwritten in this packet; the trip count is presumably the pre-packet r3 (count / 32) -- confirm against packet semantics */ | ||
255 | } | ||
256 | .falign | ||
257 | .L46: | ||
258 | { | ||
259 | dczeroa(r6) /* zero-fill path: pointer and count both move by 32 per dczeroa, so it presumably clears one 32-byte block -- confirm */ | ||
260 | r6 = add(r6,#32) | ||
261 | r2 = add(r2,#-32) | ||
262 | }:endloop0 /* start=.L46 */ | ||
263 | .L14: | ||
264 | { | ||
265 | p0 = cmp.gtu(r2,#7) | ||
266 | if (!p0.new) jump:nt .L28 /* fewer than 8 bytes left: go to the tail stores */ | ||
267 | r8 = lsr(r2,#3) /* r8 = count / 8 = number of doubleword stores */ | ||
268 | } | ||
269 | loop0(.L44,r8) | ||
270 | .falign | ||
271 | .L44: | ||
272 | { | ||
273 | memd(r6++#8) = r5:4 | ||
274 | r2 = add(r2,#-8) | ||
275 | }:endloop0 /* start=.L44 */ | ||
276 | .L28: | ||
277 | { | ||
278 | p0 = tstbit(r2,#2) | ||
279 | if (!p0.new) jump:nt .L33 /* bit 2 of the remaining count clear: no trailing word */ | ||
280 | } | ||
281 | { | ||
282 | r2 = add(r2,#-4) | ||
283 | memw(r6++#4) = r7 | ||
284 | } | ||
285 | .L33: | ||
286 | { | ||
287 | p0 = tstbit(r2,#1) | ||
288 | if (!p0.new) jump:nt .L35 /* bit 1 of the remaining count clear: no trailing halfword */ | ||
289 | } | ||
290 | { | ||
291 | r2 = add(r2,#-2) | ||
292 | memh(r6++#2) = r7 | ||
293 | } | ||
294 | .L35: | ||
295 | p0 = cmp.eq(r2,#1) | ||
296 | if (p0) memb(r6) = r1 /* trailing byte, stored only when exactly one byte remains */ | ||
297 | .L1: | ||
298 | jumpr r31 /* common return; r0 still holds dest */ | ||
299 | .L18: | ||
300 | loop0(.L45,r3) /* non-zero fill: r3 = count / 32 iterations, 32 bytes each */ | ||
301 | .falign | ||
302 | .L45: | ||
303 | dczeroa(r6) /* NOTE(review): the four doubleword stores below overwrite these 32 bytes; presumably this allocates the line without fetching it -- confirm */ | ||
304 | { | ||
305 | memd(r6++#8) = r5:4 | ||
306 | r2 = add(r2,#-32) /* one decrement covers all four stores of this iteration */ | ||
307 | } | ||
308 | memd(r6++#8) = r5:4 | ||
309 | memd(r6++#8) = r5:4 | ||
310 | { | ||
311 | memd(r6++#8) = r5:4 | ||
312 | }:endloop0 /* start=.L45 */ | ||
313 | jump .L14 /* finish the remainder with the doubleword and tail code */ | ||
314 | HEXAGON_OPT_FUNC_FINISH memset | ||
315 | #endif | ||