diff options
Diffstat (limited to 'arch/sh/lib')
-rw-r--r-- | arch/sh/lib/Makefile | 13 | ||||
-rw-r--r-- | arch/sh/lib/checksum.S | 385 | ||||
-rw-r--r-- | arch/sh/lib/delay.c | 41 | ||||
-rw-r--r-- | arch/sh/lib/div64-generic.c | 19 | ||||
-rw-r--r-- | arch/sh/lib/div64.S | 46 | ||||
-rw-r--r-- | arch/sh/lib/memchr.S | 26 | ||||
-rw-r--r-- | arch/sh/lib/memcpy-sh4.S | 800 | ||||
-rw-r--r-- | arch/sh/lib/memcpy.S | 227 | ||||
-rw-r--r-- | arch/sh/lib/memmove.S | 254 | ||||
-rw-r--r-- | arch/sh/lib/memset.S | 57 | ||||
-rw-r--r-- | arch/sh/lib/strcasecmp.c | 26 | ||||
-rw-r--r-- | arch/sh/lib/strlen.S | 70 | ||||
-rw-r--r-- | arch/sh/lib/udivdi3.c | 16 |
13 files changed, 1980 insertions, 0 deletions
diff --git a/arch/sh/lib/Makefile b/arch/sh/lib/Makefile new file mode 100644 index 000000000000..b5681e3f9684 --- /dev/null +++ b/arch/sh/lib/Makefile | |||
@@ -0,0 +1,13 @@ | |||
1 | # | ||
2 | # Makefile for SuperH-specific library files. | ||
3 | # | ||
4 | |||
# Objects that are always part of the SuperH library.
5 | lib-y = delay.o memset.o memmove.o memchr.o \ | ||
6 | checksum.o strcasecmp.o strlen.o div64.o udivdi3.o \ | ||
7 | div64-generic.o | ||
8 | |||
# memcpy-y starts out as the generic memcpy.o.  When CONFIG_CPU_SH4
# expands to "y" the second assignment names the same variable and
# replaces its value with memcpy-sh4.o, so exactly one memcpy object
# is appended to lib-y below.
9 | memcpy-y := memcpy.o | ||
10 | memcpy-$(CONFIG_CPU_SH4) := memcpy-sh4.o | ||
11 | |||
12 | lib-y += $(memcpy-y) | ||
13 | |||
diff --git a/arch/sh/lib/checksum.S b/arch/sh/lib/checksum.S new file mode 100644 index 000000000000..7c50dfe68c07 --- /dev/null +++ b/arch/sh/lib/checksum.S | |||
@@ -0,0 +1,385 @@ | |||
1 | /* $Id: checksum.S,v 1.10 2001/07/06 13:11:32 gniibe Exp $ | ||
2 | * | ||
3 | * INET An implementation of the TCP/IP protocol suite for the LINUX | ||
4 | * operating system. INET is implemented using the BSD Socket | ||
5 | * interface as the means of communication with the user level. | ||
6 | * | ||
7 | * IP/TCP/UDP checksumming routines | ||
8 | * | ||
9 | * Authors: Jorge Cwik, <jorge@laser.satlink.net> | ||
10 | * Arnt Gulbrandsen, <agulbra@nvg.unit.no> | ||
11 | * Tom May, <ftom@netcom.com> | ||
12 | * Pentium Pro/II routines: | ||
13 | * Alexander Kjeldaas <astor@guardian.no> | ||
14 | * Finn Arne Gangstad <finnag@guardian.no> | ||
15 | * Lots of code moved from tcp.c and ip.c; see those files | ||
16 | * for more names. | ||
17 | * | ||
18 | * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception | ||
19 | * handling. | ||
20 | * Andi Kleen, add zeroing on error | ||
21 | * converted to pure assembler | ||
22 | * | ||
23 | * SuperH version: Copyright (C) 1999 Niibe Yutaka | ||
24 | * | ||
25 | * This program is free software; you can redistribute it and/or | ||
26 | * modify it under the terms of the GNU General Public License | ||
27 | * as published by the Free Software Foundation; either version | ||
28 | * 2 of the License, or (at your option) any later version. | ||
29 | */ | ||
30 | |||
31 | #include <asm/errno.h> | ||
32 | #include <linux/linkage.h> | ||
33 | |||
34 | /* | ||
35 | * computes a partial checksum, e.g. for TCP/UDP fragments | ||
36 | */ | ||
37 | |||
38 | /* | ||
39 | * unsigned int csum_partial(const unsigned char *buf, int len, | ||
40 | * unsigned int sum); | ||
41 | */ | ||
42 | |||
43 | .text | ||
/*
 * csum_partial: add the data at buf into a running 32-bit sum, feeding
 * every carry out of bit 31 back into the sum.
 *
 * Register use, as established by the code below:
 *   r4  buf, post-incremented as data is consumed
 *   r5  len on entry, then reused as a loop counter
 *   r6  sum on entry; the accumulator, copied to r0 on return
 *   r1  length still to be processed, kept for the tail handling
 *
 * Only bit 1 of the buffer address is tested, so an odd start address
 * is not realigned by this routine.
 */
44 | ENTRY(csum_partial) | ||
45 | /* | ||
46 | * Experiments with Ethernet and SLIP connections show that buff | ||
47 | * is aligned on either a 2-byte or 4-byte boundary. We get at | ||
48 | * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. | ||
49 | * Fortunately, it is easy to convert 2-byte alignment to 4-byte | ||
50 | * alignment for the unrolled loop. | ||
51 | */ | ||
52 | mov r5, r1 | ||
53 | mov r4, r0 | ||
54 | tst #2, r0 ! Check alignment. | ||
55 | bt 2f ! Jump if alignment is ok. | ||
56 | ! | ||
57 | add #-2, r5 ! Alignment uses up two bytes. | ||
58 | cmp/pz r5 ! | ||
59 | bt/s 1f ! Jump if we had at least two bytes. | ||
60 | clrt | ||
61 | bra 6f | ||
62 | add #2, r5 ! r5 was < 2. Deal with it. | ||
63 | 1: | ||
64 | mov r5, r1 ! Save new len for later use. | ||
65 | mov.w @r4+, r0 | ||
66 | extu.w r0, r0 | ||
67 | addc r0, r6 | ||
68 | bf 2f | ||
69 | add #1, r6 | ||
/*
 * r5 = remaining length >> 5, i.e. the number of 32-byte chunks the
 * unrolled loop has to process (shld with a negative count shifts right).
 */
70 | 2: | ||
71 | mov #-5, r0 | ||
72 | shld r0, r5 | ||
73 | tst r5, r5 | ||
74 | bt/s 4f ! if it's =0, go to 4f | ||
75 | clrt | ||
76 | .align 2 | ||
/*
 * Eight longwords per pass.  dt overwrites T, so the carry of the last
 * addc is parked in r0 (movt) and re-created by the cmp/eq that sits in
 * the delay slot of bf/s.
 */
77 | 3: | ||
78 | mov.l @r4+, r0 | ||
79 | mov.l @r4+, r2 | ||
80 | mov.l @r4+, r3 | ||
81 | addc r0, r6 | ||
82 | mov.l @r4+, r0 | ||
83 | addc r2, r6 | ||
84 | mov.l @r4+, r2 | ||
85 | addc r3, r6 | ||
86 | mov.l @r4+, r3 | ||
87 | addc r0, r6 | ||
88 | mov.l @r4+, r0 | ||
89 | addc r2, r6 | ||
90 | mov.l @r4+, r2 | ||
91 | addc r3, r6 | ||
92 | addc r0, r6 | ||
93 | addc r2, r6 | ||
94 | movt r0 | ||
95 | dt r5 | ||
96 | bf/s 3b | ||
97 | cmp/eq #1, r0 | ||
98 | ! here, we know r5==0 | ||
99 | addc r5, r6 ! add carry to r6 | ||
/* Leftover whole longwords: (r1 & 0x1c) >> 2 of them, one per pass. */
100 | 4: | ||
101 | mov r1, r0 | ||
102 | and #0x1c, r0 | ||
103 | tst r0, r0 | ||
104 | bt/s 6f | ||
105 | mov r0, r5 | ||
106 | shlr2 r5 | ||
107 | mov #0, r2 | ||
108 | 5: | ||
109 | addc r2, r6 | ||
110 | mov.l @r4+, r2 | ||
111 | movt r0 | ||
112 | dt r5 | ||
113 | bf/s 5b | ||
114 | cmp/eq #1, r0 | ||
115 | addc r2, r6 | ||
116 | addc r5, r6 ! r5==0 here, so it means add carry-bit | ||
/* Final 0-3 bytes, selected by r1 & 3: an optional word, then an optional byte. */
117 | 6: | ||
118 | mov r1, r5 | ||
119 | mov #3, r0 | ||
120 | and r0, r5 | ||
121 | tst r5, r5 | ||
122 | bt 9f ! if it's =0 go to 9f | ||
123 | mov #2, r1 | ||
124 | cmp/hs r1, r5 | ||
125 | bf 7f | ||
126 | mov.w @r4+, r0 | ||
127 | extu.w r0, r0 | ||
128 | cmp/eq r1, r5 | ||
129 | bt/s 8f | ||
130 | clrt | ||
131 | shll16 r0 | ||
132 | addc r0, r6 | ||
133 | 7: | ||
134 | mov.b @r4+, r0 | ||
135 | extu.b r0, r0 | ||
136 | #ifndef __LITTLE_ENDIAN__ | ||
137 | shll8 r0 | ||
138 | #endif | ||
139 | 8: | ||
140 | addc r0, r6 | ||
141 | mov #0, r0 | ||
142 | addc r0, r6 | ||
/* Return the accumulator; the mov executes in the delay slot of rts. */
143 | 9: | ||
144 | rts | ||
145 | mov r6, r0 | ||
146 | |||
147 | /* | ||
148 | unsigned int csum_partial_copy_generic (const char *src, char *dst, int len, | ||
149 | int sum, int *src_err_ptr, int *dst_err_ptr) | ||
150 | */ | ||
151 | |||
152 | /* | ||
153 | * Copy from src to dst while checksumming, otherwise like csum_partial | ||
154 | * | ||
155 | * The macros SRC and DST specify the type of access for the instruction. | ||
156 | * thus we can call a custom exception handler for all access types. | ||
157 | * | ||
158 | * FIXME: could someone double-check whether I haven't mixed up some SRC and | ||
159 | * DST definitions? It's damn hard to trigger all cases. I hope I got | ||
160 | * them all but there's no guarantee. | ||
161 | */ | ||
162 | |||
/*
 * SRC(insn): emit insn under the local label 9999 and add an __ex_table
 * entry pairing that address with the fixup label 6001 (the handler
 * that reports through src_err_ptr).  Wrap every load from the source
 * buffer in this macro.
 */
163 | #define SRC(...) \ | ||
164 | 9999: __VA_ARGS__ ; \ | ||
165 | .section __ex_table, "a"; \ | ||
166 | .long 9999b, 6001f ; \ | ||
167 | .previous | ||
168 | |||
/*
 * DST(insn): same as SRC() except that the __ex_table entry points at
 * the fixup label 6002 (the handler that reports through dst_err_ptr).
 * Wrap every store to the destination buffer in this macro.
 */
169 | #define DST(...) \ | ||
170 | 9999: __VA_ARGS__ ; \ | ||
171 | .section __ex_table, "a"; \ | ||
172 | .long 9999b, 6002f ; \ | ||
173 | .previous | ||
174 | |||
175 | ! | ||
176 | ! r4: const char *SRC | ||
177 | ! r5: char *DST | ||
178 | ! r6: int LEN | ||
179 | ! r7: int SUM | ||
180 | ! | ||
181 | ! on stack: | ||
182 | ! int *SRC_ERR_PTR | ||
183 | ! int *DST_ERR_PTR | ||
184 | ! | ||
/*
 * csum_partial_copy_generic: copy LEN bytes from SRC (r4) to DST (r5)
 * while adding them into the running sum in r7.  The sum is returned
 * in r0.
 *
 * dst and len are pushed on entry so that the fault handlers can find
 * them.  Stack layout while the routine runs:
 *
 *	@(0,r15)	len
 *	@(4,r15)	dst
 *	@(8,r15)	src_err_ptr
 *	@(12,r15)	dst_err_ptr
 *
 * r2 carries the length that is still to be handled once any leading
 * alignment fix-up is done; labels 2 and 4 derive the longword and
 * byte tail counts from it, so every path that reaches them must have
 * written r2 first.
 *
 * Every load from src is wrapped in SRC() and every store to dst in
 * DST(); a fault lands in 6001 or 6002 below, both of which rejoin the
 * common exit at 5000.
 */
ENTRY(csum_partial_copy_generic)
	mov.l	r5,@-r15	! keep dst for the fault handler
	mov.l	r6,@-r15	! keep len for the fault handler

	mov	#3,r0		! Check src and dest are equally aligned
	mov	r4,r1
	and	r0,r1
	and	r5,r0
	cmp/eq	r1,r0
	bf	3f		! Different alignments, use slow version
	tst	#1,r0		! Check dest word aligned
	bf	3f		! If not, do it the slow way

	mov	#2,r0
	tst	r0,r5		! Check dest alignment.
	bt	2f		! Jump if alignment is ok.
	add	#-2,r6		! Alignment uses up two bytes.
	cmp/pz	r6		! Jump if we had at least two bytes.
	bt/s	1f
	 clrt
	add	#2,r6		! r6 was < 2.  Deal with it.
	bra	4f
	 mov	r6,r2		! label 4 reloads the length from r2, so set it here

3:	! Handle different src and dest alignments.
	! This is not common, so simple byte by byte copy will do.
	mov	r6,r2
	shlr	r6
	tst	r6,r6
	bt	4f
	clrt
	.align	2
5:
SRC(	mov.b	@r4+,r1		)
SRC(	mov.b	@r4+,r0		)
	extu.b	r1,r1
DST(	mov.b	r1,@r5		)
DST(	mov.b	r0,@(1,r5)	)
	extu.b	r0,r0
	add	#2,r5

#ifdef	__LITTLE_ENDIAN__
	shll8	r0
#else
	shll8	r1
#endif
	or	r1,r0

	addc	r0,r7
	movt	r0		! dt clobbers T: park the carry in r0 ...
	dt	r6
	bf/s	5b
	 cmp/eq	#1,r0		! ... and re-create it in the delay slot
	mov	#0,r0
	addc	r0, r7

	mov	r2, r0
	tst	#1, r0
	bt	7f
	bra	5f
	 clrt

	! src and dest equally aligned, but to a two byte boundary.
	! Handle first two bytes as a special case
	.align	2
1:
SRC(	mov.w	@r4+,r0		)
DST(	mov.w	r0,@r5		)
	add	#2,r5
	extu.w	r0,r0
	addc	r0,r7
	mov	#0,r0
	addc	r0,r7
2:
	mov	r6,r2
	mov	#-5,r0
	shld	r0,r6		! r6 = number of 32-byte chunks
	tst	r6,r6
	bt/s	2f
	 clrt
	.align	2
1:
SRC(	mov.l	@r4+,r0		)
SRC(	mov.l	@r4+,r1		)
	addc	r0,r7
DST(	mov.l	r0,@r5		)
DST(	mov.l	r1,@(4,r5)	)
	addc	r1,r7

SRC(	mov.l	@r4+,r0		)
SRC(	mov.l	@r4+,r1		)
	addc	r0,r7
DST(	mov.l	r0,@(8,r5)	)
DST(	mov.l	r1,@(12,r5)	)
	addc	r1,r7

SRC(	mov.l	@r4+,r0		)
SRC(	mov.l	@r4+,r1		)
	addc	r0,r7
DST(	mov.l	r0,@(16,r5)	)
DST(	mov.l	r1,@(20,r5)	)
	addc	r1,r7

SRC(	mov.l	@r4+,r0		)
SRC(	mov.l	@r4+,r1		)
	addc	r0,r7
DST(	mov.l	r0,@(24,r5)	)
DST(	mov.l	r1,@(28,r5)	)
	addc	r1,r7
	add	#32,r5
	movt	r0
	dt	r6
	bf/s	1b
	 cmp/eq	#1,r0
	mov	#0,r0
	addc	r0,r7

2:	mov	r2,r6
	mov	#0x1c,r0
	and	r0,r6		! bytes left in whole longwords
	cmp/pl	r6
	bf/s	4f
	 clrt
	shlr2	r6
3:
SRC(	mov.l	@r4+,r0		)
	addc	r0,r7
DST(	mov.l	r0,@r5		)
	add	#4,r5
	movt	r0
	dt	r6
	bf/s	3b
	 cmp/eq	#1,r0
	mov	#0,r0
	addc	r0,r7
4:	mov	r2,r6
	mov	#3,r0
	and	r0,r6		! final 0-3 bytes
	cmp/pl	r6
	bf	7f
	mov	#2,r1
	cmp/hs	r1,r6
	bf	5f
SRC(	mov.w	@r4+,r0		)
DST(	mov.w	r0,@r5		)
	extu.w	r0,r0
	add	#2,r5
	cmp/eq	r1,r6
	bt/s	6f
	 clrt
	shll16	r0
	addc	r0,r7
5:
SRC(	mov.b	@r4+,r0		)
DST(	mov.b	r0,@r5		)
	extu.b	r0,r0
#ifndef	__LITTLE_ENDIAN__
	shll8	r0
#endif
6:	addc	r0,r7
	mov	#0,r0
	addc	r0,r7
7:
5000:

# Exception handler:
.section .fixup, "ax"

	! Fault while reading src: store -EFAULT through src_err_ptr,
	! clear the whole destination and leave with a zero sum in r7.
6001:
	mov.l	@(8,r15),r0		! src_err_ptr
	mov	#-EFAULT,r1
	mov.l	r1,@r0

	! zero the complete destination - computing the rest
	! is too much work
	mov.l	@(4,r15),r5		! dst
	mov.l	@r15,r6			! len
	mov	#0,r7
1:	mov.b	r7,@r5
	dt	r6
	bf/s	1b
	 add	#1,r5
	mov.l	8000f,r0
	jmp	@r0
	 nop
	.align	2
8000:	.long	5000b

	! Fault while writing dst: store -EFAULT through dst_err_ptr
	! and take the common exit.
6002:
	mov.l	@(12,r15),r0		! dst_err_ptr
	mov	#-EFAULT,r1
	mov.l	r1,@r0
	mov.l	8001f,r0
	jmp	@r0
	 nop
	.align	2
8001:	.long	5000b

.previous
	add	#8,r15			! drop the saved dst and len
	rts
	 mov	r7,r0
diff --git a/arch/sh/lib/delay.c b/arch/sh/lib/delay.c new file mode 100644 index 000000000000..50b36037d86b --- /dev/null +++ b/arch/sh/lib/delay.c | |||
@@ -0,0 +1,41 @@ | |||
1 | /* | ||
2 | * Precise Delay Loops for SuperH | ||
3 | * | ||
4 | * Copyright (C) 1999 Niibe Yutaka & Kaz Kojima | ||
5 | */ | ||
6 | |||
7 | #include <linux/sched.h> | ||
8 | #include <linux/delay.h> | ||
9 | |||
/*
 * Spin in a two-instruction loop driven by @loops.
 *
 * tst primes the T flag from the initial count.  On each pass bf/s
 * branches back while T is clear, and the dt in its delay slot
 * decrements the count, setting T once it reaches zero.  The delay
 * slot runs whether or not the branch is taken, so a zero count falls
 * straight through after a single dt.
 *
 * The asm is volatile and ties its output to the input register, so
 * the compiler can neither drop the loop nor keep using the old count.
 */
10 | void __delay(unsigned long loops) | ||
11 | { | ||
12 | __asm__ __volatile__( | ||
13 | "tst %0, %0\n\t" | ||
14 | "1:\t" | ||
15 | "bf/s 1b\n\t" | ||
16 | " dt %0" | ||
17 | : "=r" (loops) | ||
18 | : "0" (loops) | ||
19 | : "t"); | ||
20 | } | ||
21 | |||
/*
 * Convert a 32.32 fixed-point fraction of a second into a loop count
 * and spin for it.
 *
 * dmulu.l forms the unsigned 64-bit product of @xloops and this CPU's
 * loops_per_jiffy in MACH:MACL; only the upper half (MACH) is read
 * back, i.e. the product shifted right by 32.  That value is then
 * multiplied by HZ and handed to __delay().
 *
 * NOTE(review): xloops * HZ is evaluated in unsigned long and would
 * wrap for large arguments - confirm that callers bound the delay.
 */
22 | inline void __const_udelay(unsigned long xloops) | ||
23 | { | ||
24 | __asm__("dmulu.l %0, %2\n\t" | ||
25 | "sts mach, %0" | ||
26 | : "=r" (xloops) | ||
27 | : "0" (xloops), "r" (cpu_data[_smp_processor_id()].loops_per_jiffy) | ||
28 | : "macl", "mach"); | ||
29 | __delay(xloops * HZ); | ||
30 | } | ||
31 | |||
/*
 * Delay for @usecs microseconds.
 *
 * The count is scaled by 2**32 / 1000000 (0x10c6 after truncation) and
 * passed to __const_udelay().  The multiplication is carried out in
 * unsigned long, so it wraps for sufficiently large @usecs.
 */
void __udelay(unsigned long usecs)
{
	const unsigned long xloops = usecs * 0x000010c6;

	__const_udelay(xloops);
}
36 | |||
/*
 * Delay for @nsecs nanoseconds.
 *
 * The count is scaled by 5 and passed to __const_udelay().
 * NOTE(review): 5 is presumably 2**32 / 1000000000 (about 4.29)
 * rounded up, by analogy with the microsecond variant - confirm.
 */
void __ndelay(unsigned long nsecs)
{
	const unsigned long xloops = nsecs * 0x00000005;

	__const_udelay(xloops);
}
41 | |||
diff --git a/arch/sh/lib/div64-generic.c b/arch/sh/lib/div64-generic.c new file mode 100644 index 000000000000..c02473afd581 --- /dev/null +++ b/arch/sh/lib/div64-generic.c | |||
@@ -0,0 +1,19 @@ | |||
1 | /* | ||
2 | * Generic __div64_32 wrapper for __xdiv64_32. | ||
3 | */ | ||
4 | |||
5 | #include <linux/types.h> | ||
6 | |||
7 | extern u64 __xdiv64_32(u64 n, u32 d); | ||
8 | |||
9 | u64 __div64_32(u64 *xp, u32 y) | ||
10 | { | ||
11 | u64 rem; | ||
12 | u64 q = __xdiv64_32(*xp, y); | ||
13 | |||
14 | rem = *xp - q * y; | ||
15 | *xp = q; | ||
16 | |||
17 | return rem; | ||
18 | } | ||
19 | |||
diff --git a/arch/sh/lib/div64.S b/arch/sh/lib/div64.S new file mode 100644 index 000000000000..eefc275d64a7 --- /dev/null +++ b/arch/sh/lib/div64.S | |||
@@ -0,0 +1,46 @@ | |||
1 | /* | ||
2 | * unsigned long long __xdiv64_32(unsigned long long n, unsigned long d); | ||
3 | */ | ||
4 | |||
5 | #include <linux/linkage.h> | ||
6 | |||
7 | .text | ||
/*
 * __xdiv64_32: 64-bit by 32-bit unsigned division returning the 64-bit
 * quotient.
 *
 * The two halves of n arrive in r4/r5 and the divisor d in r6.  The
 * endian-dependent moves below put one half in r0 and the other in r1.
 * r1 is treated as the upper half: it is divided first and what is
 * left of it seeds the division of r0.
 *
 *   r2  quotient of the upper half (0 when the upper half is below d)
 *   r0  quotient of the lower half
 *
 * The result is handed back in r0/r1 with the same endian-dependent
 * ordering that the argument used.
 */
8 | ENTRY(__xdiv64_32) | ||
9 | #ifdef __LITTLE_ENDIAN__ | ||
10 | mov r4, r0 | ||
11 | mov r5, r1 | ||
12 | #else | ||
13 | mov r4, r1 | ||
14 | mov r5, r0 | ||
15 | #endif | ||
/*
 * If the upper half is already smaller than d its quotient is 0 (set
 * in the delay slot) and it serves directly as the running remainder.
 */
16 | cmp/hs r6, r1 | ||
17 | bf.s 1f | ||
18 | mov #0, r2 | ||
19 | |||
/*
 * Otherwise divide the upper half by d: 32 rotcl/div1 steps build the
 * quotient in r2, then r1 -= r2 * d leaves the remainder in r1.
 */
20 | mov r1, r2 | ||
21 | mov #0, r3 | ||
22 | div0u | ||
23 | .rept 32 | ||
24 | rotcl r2 | ||
25 | div1 r6, r3 | ||
26 | .endr | ||
27 | rotcl r2 | ||
28 | mul.l r6, r2 | ||
29 | sts macl, r3 | ||
30 | sub r3, r1 | ||
/*
 * Divide remainder:lower-half (r1:r0) by d.  The quotient bits are
 * rotated into r0; the last rotcl is issued in the epilogue.
 */
31 | 1: | ||
32 | div0u | ||
33 | .rept 32 | ||
34 | rotcl r0 | ||
35 | div1 r6, r1 | ||
36 | .endr | ||
/* Place the two quotient halves in r0/r1; the last instruction of each arm is the rts delay slot. */
37 | #ifdef __LITTLE_ENDIAN__ | ||
38 | mov r2, r1 | ||
39 | rts | ||
40 | rotcl r0 | ||
41 | #else | ||
42 | rotcl r0 | ||
43 | mov r0, r1 | ||
44 | rts | ||
45 | mov r2, r0 | ||
46 | #endif | ||
diff --git a/arch/sh/lib/memchr.S b/arch/sh/lib/memchr.S new file mode 100644 index 000000000000..bc6036ad5706 --- /dev/null +++ b/arch/sh/lib/memchr.S | |||
@@ -0,0 +1,26 @@ | |||
1 | /* $Id: memchr.S,v 1.1 2000/04/14 16:49:01 mjd Exp $ | ||
2 | * | ||
3 | * "memchr" implementation of SuperH | ||
4 | * | ||
5 | * Copyright (C) 1999 Niibe Yutaka | ||
6 | * | ||
7 | */ | ||
8 | |||
9 | /* | ||
10 | * void *memchr(const void *s, int c, size_t n); | ||
11 | */ | ||
12 | |||
13 | #include <linux/linkage.h> | ||
/*
 * memchr: scan at most r6 bytes starting at r4 for the byte value in
 * r5.  Returns, in r0, the address of the first match, or 0 when the
 * count is zero or is used up without a match.
 */
14 | ENTRY(memchr) | ||
15 | tst r6,r6 | ||
16 | bt/s 2f | ||
17 | exts.b r5,r5 | ||
/*
 * r5 was sign-extended above and mov.b sign-extends the loaded byte, so
 * cmp/eq compares like with like.  dt sits in the delay slot of bt/s:
 * it always executes, and its T result drives the bf/s that follows.
 * The pointer is advanced in the delay slot of bf/s.
 */
18 | 1: mov.b @r4,r1 | ||
19 | cmp/eq r1,r5 | ||
20 | bt/s 3f | ||
21 | dt r6 | ||
22 | bf/s 1b | ||
23 | add #1,r4 | ||
/* Not found: return 0.  On a match r4 still points at the matching byte. */
24 | 2: mov #0,r4 | ||
25 | 3: rts | ||
26 | mov r4,r0 | ||
diff --git a/arch/sh/lib/memcpy-sh4.S b/arch/sh/lib/memcpy-sh4.S new file mode 100644 index 000000000000..55f227441f9e --- /dev/null +++ b/arch/sh/lib/memcpy-sh4.S | |||
@@ -0,0 +1,800 @@ | |||
1 | /* | ||
2 | * "memcpy" implementation of SuperH | ||
3 | * | ||
4 | * Copyright (C) 1999 Niibe Yutaka | ||
5 | * Copyright (c) 2002 STMicroelectronics Ltd | ||
6 | * Modified from memcpy.S and micro-optimised for SH4 | ||
7 | * Stuart Menefy (stuart.menefy@st.com) | ||
8 | * | ||
9 | */ | ||
10 | #include <linux/linkage.h> | ||
11 | #include <linux/config.h> | ||
12 | |||
13 | /* | ||
14 | * void *memcpy(void *dst, const void *src, size_t n); | ||
15 | * | ||
16 | * It is assumed that there is no overlap between src and dst. | ||
17 | * If there is an overlap, then the results are undefined. | ||
18 | */ | ||
19 | |||
20 | ! | ||
21 | ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR. | ||
22 | ! | ||
23 | |||
24 | ! Size is 16 or greater, and may have trailing bytes | ||
25 | |||
26 | .balign 32 | ||
27 | .Lcase1: | ||
28 | ! Read a long word and write a long word at once | ||
29 | ! At the start of each iteration, r7 contains last long load | ||
30 | add #-1,r5 ! 79 EX | ||
31 | mov r4,r2 ! 5 MT (0 cycles latency) | ||
32 | |||
33 | mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency) | ||
34 | add #-4,r5 ! 50 EX | ||
35 | |||
36 | add #7,r2 ! 79 EX | ||
37 | ! | ||
38 | #ifdef CONFIG_CPU_LITTLE_ENDIAN | ||
39 | ! 6 cycles, 4 bytes per iteration | ||
40 | 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK | ||
41 | mov r7, r3 ! 5 MT (latency=0) ! RQPO | ||
42 | |||
43 | cmp/hi r2,r0 ! 57 MT | ||
44 | shll16 r3 ! 103 EX | ||
45 | |||
46 | mov r1,r6 ! 5 MT (latency=0) | ||
47 | shll8 r3 ! 102 EX ! Oxxx | ||
48 | |||
49 | shlr8 r6 ! 106 EX ! xNML | ||
50 | mov r1, r7 ! 5 MT (latency=0) | ||
51 | |||
52 | or r6,r3 ! 82 EX ! ONML | ||
53 | bt/s 3b ! 109 BR | ||
54 | |||
55 | mov.l r3,@-r0 ! 30 LS | ||
56 | #else | ||
57 | 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN | ||
58 | mov r7,r3 ! 5 MT (latency=0) ! OPQR | ||
59 | |||
60 | cmp/hi r2,r0 ! 57 MT | ||
61 | shlr16 r3 ! 107 EX | ||
62 | |||
63 | shlr8 r3 ! 106 EX ! xxxO | ||
64 | mov r1,r6 ! 5 MT (latency=0) | ||
65 | |||
66 | shll8 r6 ! 102 EX ! LMNx | ||
67 | mov r1,r7 ! 5 MT (latency=0) | ||
68 | |||
69 | or r6,r3 ! 82 EX ! LMNO | ||
70 | bt/s 3b ! 109 BR | ||
71 | |||
72 | mov.l r3,@-r0 ! 30 LS | ||
73 | #endif | ||
74 | ! Finally, copy a byte at once, if necessary | ||
75 | |||
76 | add #4,r5 ! 50 EX | ||
77 | cmp/eq r4,r0 ! 54 MT | ||
78 | |||
79 | add #-6,r2 ! 50 EX | ||
80 | bt 9f ! 109 BR | ||
81 | |||
82 | 8: cmp/hi r2,r0 ! 57 MT | ||
83 | mov.b @(r0,r5),r1 ! 20 LS (latency=2) | ||
84 | |||
85 | bt/s 8b ! 109 BR | ||
86 | |||
87 | mov.b r1,@-r0 ! 29 LS | ||
88 | |||
89 | 9: rts | ||
90 | nop | ||
91 | |||
92 | |||
93 | ! | ||
94 | ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R... | ||
95 | ! | ||
96 | |||
97 | ! Size is 16 or greater, and may have trailing bytes | ||
98 | |||
99 | .balign 32 | ||
100 | .Lcase3: | ||
101 | ! Read a long word and write a long word at once | ||
102 | ! At the start of each iteration, r7 contains last long load | ||
103 | add #-3,r5 ! 79 EX | ||
104 | mov r4,r2 ! 5 MT (0 cycles latency) | ||
105 | |||
106 | mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency) | ||
107 | add #-4,r5 ! 50 EX | ||
108 | |||
109 | add #7,r2 ! 79 EX | ||
110 | ! | ||
111 | #ifdef CONFIG_CPU_LITTLE_ENDIAN | ||
112 | ! 6 cycles, 4 bytes per iteration | ||
113 | 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK | ||
114 | mov r7, r3 ! 5 MT (latency=0) ! RQPO | ||
115 | |||
116 | cmp/hi r2,r0 ! 57 MT | ||
117 | shll8 r3 ! 102 EX ! QPOx | ||
118 | |||
119 | mov r1,r6 ! 5 MT (latency=0) | ||
120 | shlr16 r6 ! 107 EX | ||
121 | |||
122 | shlr8 r6 ! 106 EX ! xxxN | ||
123 | mov r1, r7 ! 5 MT (latency=0) | ||
124 | |||
125 | or r6,r3 ! 82 EX ! QPON | ||
126 | bt/s 3b ! 109 BR | ||
127 | |||
128 | mov.l r3,@-r0 ! 30 LS | ||
129 | #else | ||
130 | 3: mov r1,r3 ! OPQR | ||
131 | shlr8 r3 ! xOPQ | ||
132 | mov.l @(r0,r5),r1 ! KLMN | ||
133 | mov r1,r6 | ||
134 | shll16 r6 | ||
135 | shll8 r6 ! Nxxx | ||
136 | or r6,r3 ! NOPQ | ||
137 | cmp/hi r2,r0 | ||
138 | bt/s 3b | ||
139 | mov.l r3,@-r0 | ||
140 | #endif | ||
141 | |||
142 | ! Finally, copy a byte at once, if necessary | ||
143 | |||
144 | add #6,r5 ! 50 EX | ||
145 | cmp/eq r4,r0 ! 54 MT | ||
146 | |||
147 | add #-6,r2 ! 50 EX | ||
148 | bt 9f ! 109 BR | ||
149 | |||
150 | 8: cmp/hi r2,r0 ! 57 MT | ||
151 | mov.b @(r0,r5),r1 ! 20 LS (latency=2) | ||
152 | |||
153 | bt/s 8b ! 109 BR | ||
154 | |||
155 | mov.b r1,@-r0 ! 29 LS | ||
156 | |||
157 | 9: rts | ||
158 | nop | ||
159 | |||
160 | ENTRY(memcpy) | ||
161 | |||
162 | ! Calculate the invariants which will be used in the remainder | ||
163 | ! of the code: | ||
164 | ! | ||
165 | ! r4 --> [ ... ] DST [ ... ] SRC | ||
166 | ! [ ... ] [ ... ] | ||
167 | ! : : | ||
168 | ! r0 --> [ ... ] r0+r5 --> [ ... ] | ||
169 | ! | ||
170 | ! | ||
171 | |||
172 | ! Short circuit the common case of src, dst and len being 32 bit aligned | ||
173 | ! and test for zero length move | ||
174 | |||
175 | mov r6, r0 ! 5 MT (0 cycle latency) | ||
176 | or r4, r0 ! 82 EX | ||
177 | |||
178 | or r5, r0 ! 82 EX | ||
179 | tst r6, r6 ! 86 MT | ||
180 | |||
181 | bt/s 99f ! 111 BR (zero len) | ||
182 | tst #3, r0 ! 87 MT | ||
183 | |||
184 | mov r4, r0 ! 5 MT (0 cycle latency) | ||
185 | add r6, r0 ! 49 EX | ||
186 | |||
187 | mov #16, r1 ! 6 EX | ||
188 | bt/s .Lcase00 ! 111 BR (aligned) | ||
189 | |||
190 | sub r4, r5 ! 75 EX | ||
191 | |||
192 | ! Arguments are not nicely long word aligned or zero len. | ||
193 | ! Check for small copies, and if so do a simple byte at a time copy. | ||
194 | ! | ||
195 | ! Deciding on an exact value of 'small' is not easy, as the point at which | ||
196 | ! using the optimised routines become worthwhile varies (these are the | ||
197 | ! cycle counts for different sizes using byte-at-a-time vs. optimised): | ||
198 | ! size byte-at-time long word byte | ||
199 | ! 16 42 39-40 46-50 50-55 | ||
200 | ! 24 58 43-44 54-58 62-67 | ||
201 | ! 36 82 49-50 66-70 80-85 | ||
202 | ! However the penalty for getting it 'wrong' is much higher for long word | ||
203 | ! aligned data (and this is more common), so use a value of 16. | ||
204 | |||
205 | cmp/gt r6,r1 ! 56 MT | ||
206 | |||
207 | add #-1,r5 ! 50 EX | ||
208 | bf/s 6f ! 108 BR (not small) | ||
209 | |||
210 | mov r5, r3 ! 5 MT (latency=0) | ||
211 | shlr r6 ! 104 EX | ||
212 | |||
213 | mov.b @(r0,r5),r1 ! 20 LS (latency=2) | ||
214 | bf/s 4f ! 111 BR | ||
215 | |||
216 | add #-1,r3 ! 50 EX | ||
217 | tst r6, r6 ! 86 MT | ||
218 | |||
219 | bt/s 98f ! 110 BR | ||
220 | mov.b r1,@-r0 ! 29 LS | ||
221 | |||
222 | ! 4 cycles, 2 bytes per iteration | ||
223 | 3: mov.b @(r0,r5),r1 ! 20 LS (latency=2) | ||
224 | |||
225 | 4: mov.b @(r0,r3),r2 ! 20 LS (latency=2) | ||
226 | dt r6 ! 67 EX | ||
227 | |||
228 | mov.b r1,@-r0 ! 29 LS | ||
229 | bf/s 3b ! 111 BR | ||
230 | |||
231 | mov.b r2,@-r0 ! 29 LS | ||
232 | 98: | ||
233 | rts | ||
234 | nop | ||
235 | |||
236 | 99: rts | ||
237 | mov r4, r0 | ||
238 | |||
239 | ! Size is not small, so its worthwhile looking for optimisations. | ||
240 | ! First align destination to a long word boundary. | ||
241 | ! | ||
242 | ! r5 = normal value -1 | ||
243 | |||
244 | 6: tst #3, r0 ! 87 MT | ||
245 | mov #3, r3 ! 6 EX | ||
246 | |||
247 | bt/s 2f ! 111 BR | ||
248 | and r0,r3 ! 78 EX | ||
249 | |||
250 | ! 3 cycles, 1 byte per iteration | ||
251 | 1: dt r3 ! 67 EX | ||
252 | mov.b @(r0,r5),r1 ! 19 LS (latency=2) | ||
253 | |||
254 | add #-1, r6 ! 79 EX | ||
255 | bf/s 1b ! 109 BR | ||
256 | |||
257 | mov.b r1,@-r0 ! 28 LS | ||
258 | |||
259 | 2: add #1, r5 ! 79 EX | ||
260 | |||
261 | ! Now select the appropriate bulk transfer code based on relative | ||
262 | ! alignment of src and dst. | ||
263 | |||
264 | mov r0, r3 ! 5 MT (latency=0) | ||
265 | |||
266 | mov r5, r0 ! 5 MT (latency=0) | ||
267 | tst #1, r0 ! 87 MT | ||
268 | |||
269 | bf/s 1f ! 111 BR | ||
270 | mov #64, r7 ! 6 EX | ||
271 | |||
272 | ! bit 0 clear | ||
273 | |||
274 | cmp/ge r7, r6 ! 55 MT | ||
275 | |||
276 | bt/s 2f ! 111 BR | ||
277 | tst #2, r0 ! 87 MT | ||
278 | |||
279 | ! small | ||
280 | bt/s .Lcase0 | ||
281 | mov r3, r0 | ||
282 | |||
283 | bra .Lcase2 | ||
284 | nop | ||
285 | |||
286 | ! big | ||
287 | 2: bt/s .Lcase0b | ||
288 | mov r3, r0 | ||
289 | |||
290 | bra .Lcase2b | ||
291 | nop | ||
292 | |||
293 | ! bit 0 set | ||
294 | 1: tst #2, r0 ! 87 MT | ||
295 | |||
296 | bt/s .Lcase1 | ||
297 | mov r3, r0 | ||
298 | |||
299 | bra .Lcase3 | ||
300 | nop | ||
301 | |||
302 | |||
303 | ! | ||
304 | ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR | ||
305 | ! | ||
306 | |||
307 | ! src, dst and size are all long word aligned | ||
308 | ! size is non-zero | ||
309 | |||
310 | .balign 32 | ||
311 | .Lcase00: | ||
312 | mov #64, r1 ! 6 EX | ||
313 | mov r5, r3 ! 5 MT (latency=0) | ||
314 | |||
315 | cmp/gt r6, r1 ! 56 MT | ||
316 | add #-4, r5 ! 50 EX | ||
317 | |||
318 | bf .Lcase00b ! 108 BR (big loop) | ||
319 | shlr2 r6 ! 105 EX | ||
320 | |||
321 | shlr r6 ! 104 EX | ||
322 | mov.l @(r0, r5), r1 ! 21 LS (latency=2) | ||
323 | |||
324 | bf/s 4f ! 111 BR | ||
325 | add #-8, r3 ! 50 EX | ||
326 | |||
327 | tst r6, r6 ! 86 MT | ||
328 | bt/s 5f ! 110 BR | ||
329 | |||
330 | mov.l r1,@-r0 ! 30 LS | ||
331 | |||
332 | ! 4 cycles, 2 long words per iteration | ||
333 | 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) | ||
334 | |||
335 | 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) | ||
336 | dt r6 ! 67 EX | ||
337 | |||
338 | mov.l r1, @-r0 ! 30 LS | ||
339 | bf/s 3b ! 109 BR | ||
340 | |||
341 | mov.l r2, @-r0 ! 30 LS | ||
342 | |||
343 | 5: rts | ||
344 | nop | ||
345 | |||
346 | |||
347 | ! Size is 16 or greater and less than 64, but may have trailing bytes | ||
348 | |||
349 | .balign 32 | ||
350 | .Lcase0: | ||
351 | add #-4, r5 ! 50 EX | ||
352 | mov r4, r7 ! 5 MT (latency=0) | ||
353 | |||
354 | mov.l @(r0, r5), r1 ! 21 LS (latency=2) | ||
355 | mov #4, r2 ! 6 EX | ||
356 | |||
357 | add #11, r7 ! 50 EX | ||
358 | tst r2, r6 ! 86 MT | ||
359 | |||
360 | mov r5, r3 ! 5 MT (latency=0) | ||
361 | bt/s 4f ! 111 BR | ||
362 | |||
363 | add #-4, r3 ! 50 EX | ||
364 | mov.l r1,@-r0 ! 30 LS | ||
365 | |||
366 | ! 4 cycles, 2 long words per iteration | ||
367 | 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) | ||
368 | |||
369 | 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) | ||
370 | cmp/hi r7, r0 | ||
371 | |||
372 | mov.l r1, @-r0 ! 30 LS | ||
373 | bt/s 3b ! 109 BR | ||
374 | |||
375 | mov.l r2, @-r0 ! 30 LS | ||
376 | |||
377 | ! Copy the final 0-3 bytes | ||
378 | |||
379 | add #3,r5 ! 50 EX | ||
380 | |||
381 | cmp/eq r0, r4 ! 54 MT | ||
382 | add #-10, r7 ! 50 EX | ||
383 | |||
384 | bt 9f ! 110 BR | ||
385 | |||
386 | ! 3 cycles, 1 byte per iteration | ||
387 | 1: mov.b @(r0,r5),r1 ! 19 LS | ||
388 | cmp/hi r7,r0 ! 57 MT | ||
389 | |||
390 | bt/s 1b ! 111 BR | ||
391 | mov.b r1,@-r0 ! 28 LS | ||
392 | |||
393 | 9: rts | ||
394 | nop | ||
395 | |||
396 | ! Size is at least 64 bytes, so will be going round the big loop at least once. | ||
397 | ! | ||
398 | ! r2 = rounded up r4 | ||
399 | ! r3 = rounded down r0 | ||
400 | |||
401 | .balign 32 | ||
402 | .Lcase0b: | ||
403 | add #-4, r5 ! 50 EX | ||
404 | |||
405 | .Lcase00b: | ||
406 | mov r0, r3 ! 5 MT (latency=0) | ||
407 | mov #(~0x1f), r1 ! 6 EX | ||
408 | |||
409 | and r1, r3 ! 78 EX | ||
410 | mov r4, r2 ! 5 MT (latency=0) | ||
411 | |||
412 | cmp/eq r3, r0 ! 54 MT | ||
413 | add #0x1f, r2 ! 50 EX | ||
414 | |||
415 | bt/s 1f ! 110 BR | ||
416 | and r1, r2 ! 78 EX | ||
417 | |||
418 | ! copy initial words until cache line aligned | ||
419 | |||
420 | mov.l @(r0, r5), r1 ! 21 LS (latency=2) | ||
421 | tst #4, r0 ! 87 MT | ||
422 | |||
423 | mov r5, r6 ! 5 MT (latency=0) | ||
424 | add #-4, r6 ! 50 EX | ||
425 | |||
426 | bt/s 4f ! 111 BR | ||
427 | add #8, r3 ! 50 EX | ||
428 | |||
429 | tst #0x18, r0 ! 87 MT | ||
430 | |||
431 | bt/s 1f ! 109 BR | ||
432 | mov.l r1,@-r0 ! 30 LS | ||
433 | |||
434 | ! 4 cycles, 2 long words per iteration | ||
435 | 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) | ||
436 | |||
437 | 4: mov.l @(r0, r6), r7 ! 21 LS (latency=2) | ||
438 | cmp/eq r3, r0 ! 54 MT | ||
439 | |||
440 | mov.l r1, @-r0 ! 30 LS | ||
441 | bf/s 3b ! 109 BR | ||
442 | |||
443 | mov.l r7, @-r0 ! 30 LS | ||
444 | |||
445 | ! Copy the cache line aligned blocks | ||
446 | ! | ||
447 | ! In use: r0, r2, r4, r5 | ||
448 | ! Scratch: r1, r3, r6, r7 | ||
449 | ! | ||
450 | ! We could do this with the four scratch registers, but if src | ||
451 | ! and dest hit the same cache line, this will thrash, so make | ||
452 | ! use of additional registers. | ||
453 | ! | ||
454 | ! We also need r0 as a temporary (for movca), so 'undo' the invariant: | ||
455 | ! r5: src (was r0+r5) | ||
456 | ! r1: dest (was r0) | ||
457 | ! this can be reversed at the end, so we don't need to save any extra | ||
458 | ! state. | ||
459 | ! | ||
460 | 1: mov.l r8, @-r15 ! 30 LS | ||
461 | add r0, r5 ! 49 EX | ||
462 | |||
463 | mov.l r9, @-r15 ! 30 LS | ||
464 | mov r0, r1 ! 5 MT (latency=0) | ||
465 | |||
466 | mov.l r10, @-r15 ! 30 LS | ||
467 | add #-0x1c, r5 ! 50 EX | ||
468 | |||
469 | mov.l r11, @-r15 ! 30 LS | ||
470 | |||
471 | ! 16 cycles, 32 bytes per iteration | ||
472 | 2: mov.l @(0x00,r5),r0 ! 18 LS (latency=2) | ||
473 | add #-0x20, r1 ! 50 EX | ||
474 | mov.l @(0x04,r5),r3 ! 18 LS (latency=2) | ||
475 | mov.l @(0x08,r5),r6 ! 18 LS (latency=2) | ||
476 | mov.l @(0x0c,r5),r7 ! 18 LS (latency=2) | ||
477 | mov.l @(0x10,r5),r8 ! 18 LS (latency=2) | ||
478 | mov.l @(0x14,r5),r9 ! 18 LS (latency=2) | ||
479 | mov.l @(0x18,r5),r10 ! 18 LS (latency=2) | ||
480 | mov.l @(0x1c,r5),r11 ! 18 LS (latency=2) | ||
481 | movca.l r0,@r1 ! 40 LS (latency=3-7) | ||
482 | mov.l r3,@(0x04,r1) ! 33 LS | ||
483 | mov.l r6,@(0x08,r1) ! 33 LS | ||
484 | mov.l r7,@(0x0c,r1) ! 33 LS | ||
485 | |||
486 | mov.l r8,@(0x10,r1) ! 33 LS | ||
487 | add #-0x20, r5 ! 50 EX | ||
488 | |||
489 | mov.l r9,@(0x14,r1) ! 33 LS | ||
490 | cmp/eq r2,r1 ! 54 MT | ||
491 | |||
492 | mov.l r10,@(0x18,r1) ! 33 LS | ||
493 | bf/s 2b ! 109 BR | ||
494 | |||
495 | mov.l r11,@(0x1c,r1) ! 33 LS | ||
496 | |||
497 | mov r1, r0 ! 5 MT (latency=0) | ||
498 | |||
499 | mov.l @r15+, r11 ! 15 LS | ||
500 | sub r1, r5 ! 75 EX | ||
501 | |||
502 | mov.l @r15+, r10 ! 15 LS | ||
503 | cmp/eq r4, r0 ! 54 MT | ||
504 | |||
505 | bf/s 1f ! 109 BR | ||
506 | mov.l @r15+, r9 ! 15 LS | ||
507 | |||
508 | rts | ||
509 | 1: mov.l @r15+, r8 ! 15 LS | ||
510 | sub r4, r1 ! 75 EX (len remaining) | ||
511 | |||
512 | ! number of trailing bytes is non-zero | ||
513 | ! | ||
514 | ! invariants restored (r5 already decremented by 4) | ||
515 | ! also r1=num bytes remaining | ||
516 | |||
517 | mov #4, r2 ! 6 EX | ||
518 | mov r4, r7 ! 5 MT (latency=0) | ||
519 | |||
520 | add #0x1c, r5 ! 50 EX (back to -4) | ||
521 | cmp/hs r2, r1 ! 58 MT | ||
522 | |||
523 | bf/s 5f ! 108 BR | ||
524 | add #11, r7 ! 50 EX | ||
525 | |||
526 | mov.l @(r0, r5), r6 ! 21 LS (latency=2) | ||
527 | tst r2, r1 ! 86 MT | ||
528 | |||
529 | mov r5, r3 ! 5 MT (latency=0) | ||
530 | bt/s 4f ! 111 BR | ||
531 | |||
532 | add #-4, r3 ! 50 EX | ||
533 | cmp/hs r2, r1 ! 58 MT | ||
534 | |||
535 | bt/s 5f ! 111 BR | ||
536 | mov.l r6,@-r0 ! 30 LS | ||
537 | |||
538 | ! 4 cycles, 2 long words per iteration | ||
539 | 3: mov.l @(r0, r5), r6 ! 21 LS (latency=2) | ||
540 | |||
541 | 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) | ||
542 | cmp/hi r7, r0 | ||
543 | |||
544 | mov.l r6, @-r0 ! 30 LS | ||
545 | bt/s 3b ! 109 BR | ||
546 | |||
547 | mov.l r2, @-r0 ! 30 LS | ||
548 | |||
549 | ! Copy the final 0-3 bytes | ||
550 | |||
551 | 5: cmp/eq r0, r4 ! 54 MT | ||
552 | add #-10, r7 ! 50 EX | ||
553 | |||
554 | bt 9f ! 110 BR | ||
555 | add #3,r5 ! 50 EX | ||
556 | |||
557 | ! 3 cycles, 1 byte per iteration | ||
558 | 1: mov.b @(r0,r5),r1 ! 19 LS | ||
559 | cmp/hi r7,r0 ! 57 MT | ||
560 | |||
561 | bt/s 1b ! 111 BR | ||
562 | mov.b r1,@-r0 ! 28 LS | ||
563 | |||
564 | 9: rts | ||
565 | nop | ||
566 | |||
567 | ! | ||
568 | ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR.. | ||
569 | ! | ||
570 | |||
571 | .balign 32 | ||
572 | .Lcase2: | ||
573 | ! Size is 16 or greater and less than 64, but may have trailing bytes | ||
574 | |||
575 | 2: mov r5, r6 ! 5 MT (latency=0) | ||
576 | add #-2,r5 ! 50 EX | ||
577 | |||
578 | mov r4,r2 ! 5 MT (latency=0) | ||
579 | add #-4,r6 ! 50 EX | ||
580 | |||
581 | add #7,r2 ! 50 EX | ||
582 | 3: mov.w @(r0,r5),r1 ! 20 LS (latency=2) | ||
583 | |||
584 | mov.w @(r0,r6),r3 ! 20 LS (latency=2) | ||
585 | cmp/hi r2,r0 ! 57 MT | ||
586 | |||
587 | mov.w r1,@-r0 ! 29 LS | ||
588 | bt/s 3b ! 111 BR | ||
589 | |||
590 | mov.w r3,@-r0 ! 29 LS | ||
591 | |||
592 | bra 10f | ||
593 | nop | ||
594 | |||
595 | |||
596 | .balign 32 | ||
597 | .Lcase2b: | ||
598 | ! Size is at least 64 bytes, so will be going round the big loop at least once. | ||
599 | ! | ||
600 | ! r2 = rounded up r4 | ||
601 | ! r3 = rounded down r0 | ||
602 | |||
603 | mov r0, r3 ! 5 MT (latency=0) | ||
604 | mov #(~0x1f), r1 ! 6 EX | ||
605 | |||
606 | and r1, r3 ! 78 EX | ||
607 | mov r4, r2 ! 5 MT (latency=0) | ||
608 | |||
609 | cmp/eq r3, r0 ! 54 MT | ||
610 | add #0x1f, r2 ! 50 EX | ||
611 | |||
612 | add #-2, r5 ! 50 EX | ||
613 | bt/s 1f ! 110 BR | ||
614 | and r1, r2 ! 78 EX | ||
615 | |||
616 | ! Copy a short word one at a time until we are cache line aligned | ||
617 | ! Normal values: r0, r2, r3, r4 | ||
618 | ! Unused: r1, r6, r7 | ||
619 | ! Mod: r5 (=r5-2) | ||
620 | ! | ||
621 | add #2, r3 ! 50 EX | ||
622 | |||
623 | 2: mov.w @(r0,r5),r1 ! 20 LS (latency=2) | ||
624 | cmp/eq r3,r0 ! 54 MT | ||
625 | |||
626 | bf/s 2b ! 111 BR | ||
627 | |||
628 | mov.w r1,@-r0 ! 29 LS | ||
629 | |||
630 | ! Copy the cache line aligned blocks | ||
631 | ! | ||
632 | ! In use: r0, r2, r4, r5 (=r5-2) | ||
633 | ! Scratch: r1, r3, r6, r7 | ||
634 | ! | ||
635 | ! We could do this with the four scratch registers, but if src | ||
636 | ! and dest hit the same cache line, this will thrash, so make | ||
637 | ! use of additional registers. | ||
638 | ! | ||
639 | ! We also need r0 as a temporary (for movca), so 'undo' the invariant: | ||
640 | ! r5: src (was r0+r5) | ||
641 | ! r1: dest (was r0) | ||
642 | ! this can be reversed at the end, so we don't need to save any extra | ||
643 | ! state. | ||
644 | ! | ||
645 | 1: mov.l r8, @-r15 ! 30 LS | ||
646 | add r0, r5 ! 49 EX | ||
647 | |||
648 | mov.l r9, @-r15 ! 30 LS | ||
649 | mov r0, r1 ! 5 MT (latency=0) | ||
650 | |||
651 | mov.l r10, @-r15 ! 30 LS | ||
652 | add #-0x1e, r5 ! 50 EX | ||
653 | |||
654 | mov.l r11, @-r15 ! 30 LS | ||
655 | |||
656 | mov.l r12, @-r15 ! 30 LS | ||
657 | |||
658 | ! 17 cycles, 32 bytes per iteration | ||
659 | #ifdef CONFIG_CPU_LITTLE_ENDIAN | ||
660 | 2: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI | ||
661 | add #-0x20, r1 ! 50 EX | ||
662 | |||
663 | mov.l @r5+, r3 ! 15 LS (latency=2) NMLK | ||
664 | |||
665 | mov.l @r5+, r6 ! 15 LS (latency=2) RQPO | ||
666 | shll16 r0 ! 103 EX JI.. | ||
667 | |||
668 | mov.l @r5+, r7 ! 15 LS (latency=2) | ||
669 | xtrct r3, r0 ! 48 EX LKJI | ||
670 | |||
671 | mov.l @r5+, r8 ! 15 LS (latency=2) | ||
672 | xtrct r6, r3 ! 48 EX PONM | ||
673 | |||
674 | mov.l @r5+, r9 ! 15 LS (latency=2) | ||
675 | xtrct r7, r6 ! 48 EX | ||
676 | |||
677 | mov.l @r5+, r10 ! 15 LS (latency=2) | ||
678 | xtrct r8, r7 ! 48 EX | ||
679 | |||
680 | mov.l @r5+, r11 ! 15 LS (latency=2) | ||
681 | xtrct r9, r8 ! 48 EX | ||
682 | |||
683 | mov.w @r5+, r12 ! 15 LS (latency=2) | ||
684 | xtrct r10, r9 ! 48 EX | ||
685 | |||
686 | movca.l r0,@r1 ! 40 LS (latency=3-7) | ||
687 | xtrct r11, r10 ! 48 EX | ||
688 | |||
689 | mov.l r3, @(0x04,r1) ! 33 LS | ||
690 | xtrct r12, r11 ! 48 EX | ||
691 | |||
692 | mov.l r6, @(0x08,r1) ! 33 LS | ||
693 | |||
694 | mov.l r7, @(0x0c,r1) ! 33 LS | ||
695 | |||
696 | mov.l r8, @(0x10,r1) ! 33 LS | ||
697 | add #-0x40, r5 ! 50 EX | ||
698 | |||
699 | mov.l r9, @(0x14,r1) ! 33 LS | ||
700 | cmp/eq r2,r1 ! 54 MT | ||
701 | |||
702 | mov.l r10, @(0x18,r1) ! 33 LS | ||
703 | bf/s 2b ! 109 BR | ||
704 | |||
705 | mov.l r11, @(0x1c,r1) ! 33 LS | ||
706 | #else | ||
707 | 2: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2) | ||
708 | add #-2, r5 ! 50 EX | ||
709 | |||
710 | mov.l @(0x1c,r5), r3 ! 18 LS (latency=2) | ||
711 | add #-4, r1 ! 50 EX | ||
712 | |||
713 | mov.l @(0x18,r5), r6 ! 18 LS (latency=2) | ||
714 | shll16 r0 ! 103 EX | ||
715 | |||
716 | mov.l @(0x14,r5), r7 ! 18 LS (latency=2) | ||
717 | xtrct r3, r0 ! 48 EX | ||
718 | |||
719 | mov.l @(0x10,r5), r8 ! 18 LS (latency=2) | ||
720 | xtrct r6, r3 ! 48 EX | ||
721 | |||
722 | mov.l @(0x0c,r5), r9 ! 18 LS (latency=2) | ||
723 | xtrct r7, r6 ! 48 EX | ||
724 | |||
725 | mov.l @(0x08,r5), r10 ! 18 LS (latency=2) | ||
726 | xtrct r8, r7 ! 48 EX | ||
727 | |||
728 | mov.l @(0x04,r5), r11 ! 18 LS (latency=2) | ||
729 | xtrct r9, r8 ! 48 EX | ||
730 | |||
731 | mov.w @(0x02,r5), r12 ! 18 LS (latency=2) | ||
732 | xtrct r10, r9 ! 48 EX | ||
733 | |||
734 | movca.l r0,@r1 ! 40 LS (latency=3-7) | ||
735 | add #-0x1c, r1 ! 50 EX | ||
736 | |||
737 | mov.l r3, @(0x1c,r1) ! 33 LS | ||
738 | xtrct r11, r10 ! 48 EX | ||
739 | |||
740 | mov.l r6, @(0x18,r1) ! 33 LS | ||
741 | xtrct r12, r11 ! 48 EX | ||
742 | |||
743 | mov.l r7, @(0x14,r1) ! 33 LS | ||
744 | |||
745 | mov.l r8, @(0x10,r1) ! 33 LS | ||
746 | add #-0x3e, r5 ! 50 EX | ||
747 | |||
748 | mov.l r9, @(0x0c,r1) ! 33 LS | ||
749 | cmp/eq r2,r1 ! 54 MT | ||
750 | |||
751 | mov.l r10, @(0x08,r1) ! 33 LS | ||
752 | bf/s 2b ! 109 BR | ||
753 | |||
754 | mov.l r11, @(0x04,r1) ! 33 LS | ||
755 | #endif | ||
756 | |||
757 | mov.l @r15+, r12 | ||
758 | mov r1, r0 ! 5 MT (latency=0) | ||
759 | |||
760 | mov.l @r15+, r11 ! 15 LS | ||
761 | sub r1, r5 ! 75 EX | ||
762 | |||
763 | mov.l @r15+, r10 ! 15 LS | ||
764 | cmp/eq r4, r0 ! 54 MT | ||
765 | |||
766 | bf/s 1f ! 109 BR | ||
767 | mov.l @r15+, r9 ! 15 LS | ||
768 | |||
769 | rts | ||
770 | 1: mov.l @r15+, r8 ! 15 LS | ||
771 | |||
772 | add #0x1e, r5 ! 50 EX | ||
773 | |||
774 | ! Finish off a short word at a time | ||
775 | ! r5 must be invariant - 2 | ||
776 | 10: mov r4,r2 ! 5 MT (latency=0) | ||
777 | add #1,r2 ! 50 EX | ||
778 | |||
779 | cmp/hi r2, r0 ! 57 MT | ||
780 | bf/s 1f ! 109 BR | ||
781 | |||
782 | add #2, r2 ! 50 EX | ||
783 | |||
784 | 3: mov.w @(r0,r5),r1 ! 20 LS | ||
785 | cmp/hi r2,r0 ! 57 MT | ||
786 | |||
787 | bt/s 3b ! 109 BR | ||
788 | |||
789 | mov.w r1,@-r0 ! 29 LS | ||
790 | 1: | ||
791 | |||
792 | ! | ||
793 | ! Finally, copy the last byte if necessary | ||
794 | cmp/eq r4,r0 ! 54 MT | ||
795 | bt/s 9b | ||
796 | add #1,r5 | ||
797 | mov.b @(r0,r5),r1 | ||
798 | rts | ||
799 | mov.b r1,@-r0 | ||
800 | |||
diff --git a/arch/sh/lib/memcpy.S b/arch/sh/lib/memcpy.S new file mode 100644 index 000000000000..232fab34c261 --- /dev/null +++ b/arch/sh/lib/memcpy.S | |||
@@ -0,0 +1,227 @@ | |||
1 | /* $Id: memcpy.S,v 1.3 2001/07/27 11:50:52 gniibe Exp $ | ||
2 | * | ||
3 | * "memcpy" implementation of SuperH | ||
4 | * | ||
5 | * Copyright (C) 1999 Niibe Yutaka | ||
6 | * | ||
7 | */ | ||
8 | |||
9 | /* | ||
10 | * void *memcpy(void *dst, const void *src, size_t n); | ||
11 | * The memory areas of DST and SRC are assumed not to overlap. | ||
12 | * In: r4 = dst, r5 = src, r6 = n. Copying runs from the last byte down; r0 ends at dst. | ||
13 | */ | ||
14 | #include <linux/linkage.h> | ||
15 | ENTRY(memcpy) | ||
16 | tst r6,r6 ! T = (n == 0) | ||
17 | bt/s 9f ! if n=0, do nothing | ||
18 | mov r4,r0 ! (delay slot) r0 = DST | ||
19 | sub r4,r5 ! From here, r5 has the distance to r0 (SRC - DST) | ||
20 | add r6,r0 ! From here, r0 points just past the end of the destination (DST + n) | ||
21 | mov #12,r1 | ||
22 | cmp/gt r6,r1 ! T = (12 > n) | ||
23 | bt/s 7f ! if it's too small, copy a byte at once | ||
24 | add #-1,r5 ! (delay slot) -1 bias: the byte loop loads @(r0,r5) before it pre-decrements r0 | ||
25 | add #1,r5 ! branch not taken: undo the bias | ||
26 | ! From here, r6 is free | ||
27 | ! | ||
28 | ! r4 --> [ ... ] DST [ ... ] SRC | ||
29 | ! [ ... ] [ ... ] | ||
30 | ! : : | ||
31 | ! r0 --> [ ... ] r0+r5 --> [ ... ] | ||
32 | ! | ||
33 | ! Dispatch on (SRC - DST) & 3 through jmptable | ||
34 | mov r5,r1 | ||
35 | mov #3,r2 ! r2 = 3 is reused as the alignment mask by case0, case1 and case3 | ||
36 | and r2,r1 | ||
37 | shll2 r1 ! index * 4: byte offset into jmptable | ||
38 | mov r0,r3 ! Save the value on R0 to R3 | ||
39 | mova jmptable,r0 | ||
40 | add r1,r0 | ||
41 | mov.l @r0,r1 ! r1 = address of the selected case | ||
42 | jmp @r1 | ||
43 | mov r3,r0 ! (delay slot) and back to R0 | ||
44 | .balign 4 | ||
45 | jmptable: | ||
46 | .long case0 | ||
47 | .long case1 | ||
48 | .long case2 | ||
49 | .long case3 | ||
50 | |||
51 | ! copy a byte at once | ||
52 | 7: mov r4,r2 | ||
53 | add #1,r2 ! r2 = DST + 1: loop bound | ||
54 | 8: | ||
55 | cmp/hi r2,r0 | ||
56 | mov.b @(r0,r5),r1 | ||
57 | bt/s 8b ! while (r0>r2) | ||
58 | mov.b r1,@-r0 ! (delay slot) store to --r0 | ||
59 | 9: | ||
60 | rts | ||
61 | nop | ||
62 | |||
63 | case0: | ||
64 | ! | ||
65 | ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR | ||
66 | ! | ||
67 | ! First, align to long word boundary | ||
68 | mov r0,r3 | ||
69 | and r2,r3 ! r3 = r0 & 3: bytes above the last long word boundary | ||
70 | tst r3,r3 | ||
71 | bt/s 2f | ||
72 | add #-4,r5 ! (delay slot) -4 bias for the long word loop | ||
73 | add #3,r5 ! net -1: byte-copy bias | ||
74 | 1: dt r3 | ||
75 | mov.b @(r0,r5),r1 | ||
76 | bf/s 1b | ||
77 | mov.b r1,@-r0 | ||
78 | ! | ||
79 | add #-3,r5 ! back to the -4 long word bias | ||
80 | 2: ! Second, copy a long word at once | ||
81 | mov r4,r2 | ||
82 | add #7,r2 ! r2 = DST + 7: loop bound | ||
83 | 3: mov.l @(r0,r5),r1 | ||
84 | cmp/hi r2,r0 | ||
85 | bt/s 3b | ||
86 | mov.l r1,@-r0 | ||
87 | ! | ||
88 | ! Third, copy a byte at once, if necessary | ||
89 | cmp/eq r4,r0 | ||
90 | bt/s 9b | ||
91 | add #3,r5 ! (delay slot) -4 + 3 = -1: byte-copy bias | ||
92 | bra 8b | ||
93 | add #-6,r2 ! (delay slot) r2 = DST + 1, the bound loop 8 uses | ||
94 | |||
95 | case1: | ||
96 | ! | ||
97 | ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR. | ||
98 | ! | ||
99 | ! First, align to long word boundary | ||
100 | mov r0,r3 | ||
101 | and r2,r3 | ||
102 | tst r3,r3 | ||
103 | bt/s 2f | ||
104 | add #-1,r5 ! (delay slot) byte-copy bias | ||
105 | 1: dt r3 | ||
106 | mov.b @(r0,r5),r1 | ||
107 | bf/s 1b | ||
108 | mov.b r1,@-r0 | ||
109 | ! | ||
110 | 2: ! Second, read a long word and write a long word at once | ||
111 | mov.l @(r0,r5),r1 ! prime r1 with the first source long word | ||
112 | add #-4,r5 | ||
113 | mov r4,r2 | ||
114 | add #7,r2 ! r2 = DST + 7: loop bound | ||
115 | ! | ||
116 | #ifdef __LITTLE_ENDIAN__ | ||
117 | 3: mov r1,r3 ! RQPO | ||
118 | shll16 r3 | ||
119 | shll8 r3 ! Oxxx | ||
120 | mov.l @(r0,r5),r1 ! NMLK | ||
121 | mov r1,r6 | ||
122 | shlr8 r6 ! xNML | ||
123 | or r6,r3 ! ONML | ||
124 | cmp/hi r2,r0 | ||
125 | bt/s 3b | ||
126 | mov.l r3,@-r0 ! (delay slot) store the merged long word | ||
127 | #else | ||
128 | 3: mov r1,r3 ! OPQR | ||
129 | shlr16 r3 | ||
130 | shlr8 r3 ! xxxO | ||
131 | mov.l @(r0,r5),r1 ! KLMN | ||
132 | mov r1,r6 | ||
133 | shll8 r6 ! LMNx | ||
134 | or r6,r3 ! LMNO | ||
135 | cmp/hi r2,r0 | ||
136 | bt/s 3b | ||
137 | mov.l r3,@-r0 ! (delay slot) store the merged long word | ||
138 | #endif | ||
139 | ! | ||
140 | ! Third, copy a byte at once, if necessary | ||
141 | cmp/eq r4,r0 | ||
142 | bt/s 9b | ||
143 | add #4,r5 ! (delay slot) -1 - 4 + 4 = -1: byte-copy bias | ||
144 | bra 8b | ||
145 | add #-6,r2 ! (delay slot) r2 = DST + 1, the bound loop 8 uses | ||
146 | |||
147 | case2: | ||
148 | ! | ||
149 | ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR.. | ||
150 | ! | ||
151 | ! First, align to word boundary | ||
152 | tst #1,r0 ! T = (r0 is even) | ||
153 | bt/s 2f | ||
154 | add #-1,r5 ! (delay slot) byte-copy bias | ||
155 | mov.b @(r0,r5),r1 ! odd end address: copy a single byte | ||
156 | mov.b r1,@-r0 | ||
157 | ! | ||
158 | 2: ! Second, read a word and write a word at once | ||
159 | add #-1,r5 ! -2 bias for 16-bit copies | ||
160 | mov r4,r2 | ||
161 | add #3,r2 ! r2 = DST + 3: loop bound | ||
162 | ! | ||
163 | 3: mov.w @(r0,r5),r1 | ||
164 | cmp/hi r2,r0 | ||
165 | bt/s 3b | ||
166 | mov.w r1,@-r0 | ||
167 | ! | ||
168 | ! Third, copy a byte at once, if necessary | ||
169 | cmp/eq r4,r0 | ||
170 | bt/s 9b | ||
171 | add #1,r5 ! (delay slot) back to the -1 byte-copy bias | ||
172 | mov.b @(r0,r5),r1 ! only one byte can be left here | ||
173 | rts | ||
174 | mov.b r1,@-r0 ! (delay slot) store the final byte | ||
175 | |||
176 | case3: | ||
177 | ! | ||
178 | ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R... | ||
179 | ! | ||
180 | ! First, align to long word boundary | ||
181 | mov r0,r3 | ||
182 | and r2,r3 | ||
183 | tst r3,r3 | ||
184 | bt/s 2f | ||
185 | add #-1,r5 ! (delay slot) byte-copy bias | ||
186 | 1: dt r3 | ||
187 | mov.b @(r0,r5),r1 | ||
188 | bf/s 1b | ||
189 | mov.b r1,@-r0 | ||
190 | ! | ||
191 | 2: ! Second, read a long word and write a long word at once | ||
192 | add #-2,r5 | ||
193 | mov.l @(r0,r5),r1 ! prime r1 with the first source long word | ||
194 | add #-4,r5 | ||
195 | mov r4,r2 | ||
196 | add #7,r2 ! r2 = DST + 7: loop bound | ||
197 | ! | ||
198 | #ifdef __LITTLE_ENDIAN__ | ||
199 | 3: mov r1,r3 ! RQPO | ||
200 | shll8 r3 ! QPOx | ||
201 | mov.l @(r0,r5),r1 ! NMLK | ||
202 | mov r1,r6 | ||
203 | shlr16 r6 | ||
204 | shlr8 r6 ! xxxN | ||
205 | or r6,r3 ! QPON | ||
206 | cmp/hi r2,r0 | ||
207 | bt/s 3b | ||
208 | mov.l r3,@-r0 ! (delay slot) store the merged long word | ||
209 | #else | ||
210 | 3: mov r1,r3 ! OPQR | ||
211 | shlr8 r3 ! xOPQ | ||
212 | mov.l @(r0,r5),r1 ! KLMN | ||
213 | mov r1,r6 | ||
214 | shll16 r6 | ||
215 | shll8 r6 ! Nxxx | ||
216 | or r6,r3 ! NOPQ | ||
217 | cmp/hi r2,r0 | ||
218 | bt/s 3b | ||
219 | mov.l r3,@-r0 ! (delay slot) store the merged long word | ||
220 | #endif | ||
221 | ! | ||
222 | ! Third, copy a byte at once, if necessary | ||
223 | cmp/eq r4,r0 | ||
224 | bt/s 9b | ||
225 | add #6,r5 ! (delay slot) -1 - 2 - 4 + 6 = -1: byte-copy bias | ||
226 | bra 8b | ||
227 | add #-6,r2 ! (delay slot) r2 = DST + 1, the bound loop 8 uses | ||
diff --git a/arch/sh/lib/memmove.S b/arch/sh/lib/memmove.S new file mode 100644 index 000000000000..5a2211f09202 --- /dev/null +++ b/arch/sh/lib/memmove.S | |||
@@ -0,0 +1,254 @@ | |||
1 | /* $Id: memmove.S,v 1.2 2001/07/27 11:51:09 gniibe Exp $ | ||
2 | * | ||
3 | * "memmove" implementation of SuperH | ||
4 | * | ||
5 | * Copyright (C) 1999 Niibe Yutaka | ||
6 | * | ||
7 | */ | ||
8 | |||
9 | /* | ||
10 | * void *memmove(void *dst, const void *src, size_t n); | ||
11 | * The memory areas may overlap. | ||
12 | * In: r4 = dst, r5 = src, r6 = n. dst > src is handed to memcpy; otherwise the copy runs forward here. | ||
13 | */ | ||
14 | #include <linux/linkage.h> | ||
15 | ENTRY(memmove) | ||
16 | ! if dest > src, call memcpy (it copies in decreasing order) | ||
17 | cmp/hi r5,r4 ! T = (dst > src), unsigned | ||
18 | bf 1f | ||
19 | mov.l 2f,r0 ! r0 = address of memcpy, from the literal at 2: | ||
20 | jmp @r0 | ||
21 | nop ! (delay slot) | ||
22 | .balign 4 | ||
23 | 2: .long memcpy | ||
24 | 1: | ||
25 | sub r5,r4 ! From here, r4 has the distance to r0 (DST - SRC) | ||
26 | tst r6,r6 | ||
27 | bt/s 9f ! if n=0, do nothing | ||
28 | mov r5,r0 ! (delay slot) r0 = SRC: the read cursor | ||
29 | add r6,r5 ! r5 = SRC + n: end of the source | ||
30 | mov #12,r1 | ||
31 | cmp/gt r6,r1 ! T = (12 > n) | ||
32 | bt/s 8f ! if it's too small, copy a byte at once | ||
33 | add #-1,r4 ! (delay slot) -1 bias: loop 8 stores after r0 has been post-incremented | ||
34 | add #1,r4 ! branch not taken: undo the bias | ||
35 | ! | ||
36 | ! [ ... ] DST [ ... ] SRC | ||
37 | ! [ ... ] [ ... ] | ||
38 | ! : : | ||
39 | ! r0+r4--> [ ... ] r0 --> [ ... ] | ||
40 | ! : : | ||
41 | ! [ ... ] [ ... ] | ||
42 | ! r5 --> | ||
43 | ! Dispatch on (DST - SRC) & 3 through jmptable | ||
44 | mov r4,r1 | ||
45 | mov #3,r2 ! r2 = 3 is reused as the alignment mask by case0, case1 and case3 | ||
46 | and r2,r1 | ||
47 | shll2 r1 ! index * 4: byte offset into jmptable | ||
48 | mov r0,r3 ! Save the value on R0 to R3 | ||
49 | mova jmptable,r0 | ||
50 | add r1,r0 | ||
51 | mov.l @r0,r1 ! r1 = address of the selected case | ||
52 | jmp @r1 | ||
53 | mov r3,r0 ! (delay slot) and back to R0 | ||
54 | .balign 4 | ||
55 | jmptable: | ||
56 | .long case0 | ||
57 | .long case1 | ||
58 | .long case2 | ||
59 | .long case3 | ||
60 | |||
61 | ! copy a byte at once | ||
62 | 8: mov.b @r0+,r1 | ||
63 | cmp/hs r5,r0 | ||
64 | bf/s 8b ! while (r0<r5) | ||
65 | mov.b r1,@(r0,r4) ! (delay slot) r0 is already one past the byte just read | ||
66 | add #1,r4 ! remove the -1 bias: r4 = DST - SRC again | ||
67 | 9: | ||
68 | add r4,r0 ! r0 + (DST - SRC) | ||
69 | rts | ||
70 | sub r6,r0 ! (delay slot) minus n: r0 = DST when r0 was SRC + n | ||
71 | |||
72 | case_none: ! NOTE(review): no branch to case_none is visible in this file | ||
73 | bra 8b | ||
74 | add #-1,r4 | ||
75 | |||
76 | case0: | ||
77 | ! | ||
78 | ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR | ||
79 | ! | ||
80 | ! First, align to long word boundary | ||
81 | mov r0,r3 | ||
82 | and r2,r3 ! r3 = r0 & 3 | ||
83 | tst r3,r3 | ||
84 | bt/s 2f | ||
85 | add #-1,r4 ! (delay slot) byte-copy bias | ||
86 | mov #4,r2 | ||
87 | sub r3,r2 ! r2 = 4 - (r0 & 3): bytes up to the next long word boundary | ||
88 | 1: dt r2 | ||
89 | mov.b @r0+,r1 | ||
90 | bf/s 1b | ||
91 | mov.b r1,@(r0,r4) | ||
92 | ! | ||
93 | 2: ! Second, copy a long word at once | ||
94 | add #-3,r4 ! -4 bias: stores follow the post-increment of r0 by 4 | ||
95 | add #-3,r5 ! loop bound: end of source - 3 | ||
96 | 3: mov.l @r0+,r1 | ||
97 | cmp/hs r5,r0 | ||
98 | bf/s 3b | ||
99 | mov.l r1,@(r0,r4) | ||
100 | add #3,r5 ! r5 = end of source again | ||
101 | ! | ||
102 | ! Third, copy a byte at once, if necessary | ||
103 | cmp/eq r5,r0 | ||
104 | bt/s 9b | ||
105 | add #4,r4 ! (delay slot) remove the -4 bias | ||
106 | bra 8b | ||
107 | add #-1,r4 ! (delay slot) byte-copy bias for loop 8 | ||
108 | |||
109 | case3: | ||
110 | ! | ||
111 | ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR. | ||
112 | ! | ||
113 | ! First, align to long word boundary | ||
114 | mov r0,r3 | ||
115 | and r2,r3 | ||
116 | tst r3,r3 | ||
117 | bt/s 2f | ||
118 | add #-1,r4 ! (delay slot) byte-copy bias | ||
119 | mov #4,r2 | ||
120 | sub r3,r2 ! r2 = 4 - (r0 & 3) | ||
121 | 1: dt r2 | ||
122 | mov.b @r0+,r1 | ||
123 | bf/s 1b | ||
124 | mov.b r1,@(r0,r4) | ||
125 | ! | ||
126 | 2: ! Second, read a long word and write a long word at once | ||
127 | add #-2,r4 | ||
128 | mov.l @(r0,r4),r1 ! NOTE(review): r0+r4 is on the destination side, so r1 is primed from destination memory | ||
129 | add #-7,r5 ! loop bound: end of source - 7 | ||
130 | add #-4,r4 ! total bias is now -7 | ||
131 | ! | ||
132 | #ifdef __LITTLE_ENDIAN__ | ||
133 | shll8 r1 | ||
134 | 3: mov r1,r3 ! JIHG | ||
135 | shlr8 r3 ! xJIH | ||
136 | mov.l @r0+,r1 ! NMLK | ||
137 | mov r1,r2 | ||
138 | shll16 r2 | ||
139 | shll8 r2 ! Kxxx | ||
140 | or r2,r3 ! KJIH | ||
141 | cmp/hs r5,r0 | ||
142 | bf/s 3b | ||
143 | mov.l r3,@(r0,r4) ! (delay slot) store the merged long word | ||
144 | #else | ||
145 | shlr8 r1 | ||
146 | 3: mov r1,r3 ! GHIJ | ||
147 | shll8 r3 ! HIJx | ||
148 | mov.l @r0+,r1 ! KLMN | ||
149 | mov r1,r2 | ||
150 | shlr16 r2 | ||
151 | shlr8 r2 ! xxxK | ||
152 | or r2,r3 ! HIJK | ||
153 | cmp/hs r5,r0 | ||
154 | bf/s 3b | ||
155 | mov.l r3,@(r0,r4) ! (delay slot) store the merged long word | ||
156 | #endif | ||
157 | add #7,r5 ! r5 = end of source again | ||
158 | ! | ||
159 | ! Third, copy a byte at once, if necessary | ||
160 | cmp/eq r5,r0 | ||
161 | bt/s 9b | ||
162 | add #7,r4 ! (delay slot) remove the -7 bias | ||
163 | add #-3,r0 ! step back over the 3 bytes read into r1 but not yet stored | ||
164 | bra 8b | ||
165 | add #-1,r4 ! (delay slot) byte-copy bias for loop 8 | ||
166 | |||
167 | case2: | ||
168 | ! | ||
169 | ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR.. | ||
170 | ! | ||
171 | ! First, align to word boundary | ||
172 | tst #1,r0 ! T = (r0 is even) | ||
173 | bt/s 2f | ||
174 | add #-1,r4 ! (delay slot) byte-copy bias | ||
175 | mov.b @r0+,r1 ! odd source address: copy a single byte | ||
176 | mov.b r1,@(r0,r4) | ||
177 | ! | ||
178 | 2: ! Second, read a word and write a word at once | ||
179 | add #-1,r4 ! -2 bias for 16-bit copies | ||
180 | add #-1,r5 ! loop bound: end of source - 1 | ||
181 | ! | ||
182 | 3: mov.w @r0+,r1 | ||
183 | cmp/hs r5,r0 | ||
184 | bf/s 3b | ||
185 | mov.w r1,@(r0,r4) | ||
186 | add #1,r5 ! r5 = end of source again | ||
187 | ! | ||
188 | ! Third, copy a byte at once, if necessary | ||
189 | cmp/eq r5,r0 | ||
190 | bt/s 9b | ||
191 | add #2,r4 ! (delay slot) remove the -2 bias | ||
192 | mov.b @r0,r1 ! copy the single remaining byte | ||
193 | mov.b r1,@(r0,r4) | ||
194 | bra 9b | ||
195 | add #1,r0 ! (delay slot) advance r0 past that byte | ||
196 | |||
197 | case1: | ||
198 | ! | ||
199 | ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R... | ||
200 | ! | ||
201 | ! First, align to long word boundary | ||
202 | mov r0,r3 | ||
203 | and r2,r3 | ||
204 | tst r3,r3 | ||
205 | bt/s 2f | ||
206 | add #-1,r4 ! (delay slot) byte-copy bias | ||
207 | mov #4,r2 | ||
208 | sub r3,r2 ! r2 = 4 - (r0 & 3) | ||
209 | 1: dt r2 | ||
210 | mov.b @r0+,r1 | ||
211 | bf/s 1b | ||
212 | mov.b r1,@(r0,r4) | ||
213 | ! | ||
214 | 2: ! Second, read a long word and write a long word at once | ||
215 | mov.l @(r0,r4),r1 ! NOTE(review): r0+r4 is on the destination side, so r1 is primed from destination memory | ||
216 | add #-7,r5 ! loop bound: end of source - 7 | ||
217 | add #-4,r4 ! total bias is now -5 | ||
218 | ! | ||
219 | #ifdef __LITTLE_ENDIAN__ | ||
220 | shll16 r1 | ||
221 | shll8 r1 | ||
222 | 3: mov r1,r3 ! JIHG | ||
223 | shlr16 r3 | ||
224 | shlr8 r3 ! xxxJ | ||
225 | mov.l @r0+,r1 ! NMLK | ||
226 | mov r1,r2 | ||
227 | shll8 r2 ! MLKx | ||
228 | or r2,r3 ! MLKJ | ||
229 | cmp/hs r5,r0 | ||
230 | bf/s 3b | ||
231 | mov.l r3,@(r0,r4) ! (delay slot) store the merged long word | ||
232 | #else | ||
233 | shlr16 r1 | ||
234 | shlr8 r1 | ||
235 | 3: mov r1,r3 ! GHIJ | ||
236 | shll16 r3 | ||
237 | shll8 r3 ! Jxxx | ||
238 | mov.l @r0+,r1 ! KLMN | ||
239 | mov r1,r2 | ||
240 | shlr8 r2 ! xKLM | ||
241 | or r2,r3 ! JKLM | ||
242 | cmp/hs r5,r0 | ||
243 | bf/s 3b ! while(r0<r5) | ||
244 | mov.l r3,@(r0,r4) ! (delay slot) store the merged long word | ||
245 | #endif | ||
246 | add #7,r5 ! r5 = end of source again | ||
247 | ! | ||
248 | ! Third, copy a byte at once, if necessary | ||
249 | cmp/eq r5,r0 | ||
250 | bt/s 9b | ||
251 | add #5,r4 ! (delay slot) remove the -5 bias | ||
252 | add #-3,r0 ! step r0 back by 3 before the byte loop | ||
253 | bra 8b | ||
254 | add #-1,r4 ! (delay slot) byte-copy bias for loop 8 | ||
diff --git a/arch/sh/lib/memset.S b/arch/sh/lib/memset.S new file mode 100644 index 000000000000..95670090680e --- /dev/null +++ b/arch/sh/lib/memset.S | |||
@@ -0,0 +1,57 @@ | |||
1 | /* $Id: memset.S,v 1.1 2000/04/14 16:49:01 mjd Exp $ | ||
2 | * | ||
3 | * "memset" implementation of SuperH | ||
4 | * | ||
5 | * Copyright (C) 1999 Niibe Yutaka | ||
6 | * | ||
7 | */ | ||
8 | |||
9 | /* | ||
10 | * void *memset(void *s, int c, size_t n); | ||
11 | * In: r4 = s, r5 = c, r6 = n. Fills downward from s+n using only the low byte of c; r0 ends at s. | ||
12 | */ | ||
13 | #include <linux/linkage.h> | ||
14 | |||
15 | ENTRY(memset) | ||
16 | tst r6,r6 ! T = (n == 0) | ||
17 | bt/s 5f ! if n=0, do nothing | ||
18 | add r6,r4 ! (delay slot) r4 = s + n: filling runs downward from the end | ||
19 | mov #12,r0 | ||
20 | cmp/gt r6,r0 ! T = (12 > n) | ||
21 | bt/s 4f ! if it's too small, set a byte at once | ||
22 | mov r4,r0 ! (delay slot) | ||
23 | and #3,r0 ! r0 = (s + n) & 3: bytes above the last long word boundary | ||
24 | cmp/eq #0,r0 | ||
25 | bt/s 2f ! It's aligned | ||
26 | sub r0,r6 ! (delay slot) take those bytes off n | ||
27 | 1: | ||
28 | dt r0 | ||
29 | bf/s 1b | ||
30 | mov.b r5,@-r4 ! (delay slot) one byte per pass until r4 is long word aligned | ||
31 | 2: extu.b r5,r5 ! make VVVV: first reduce c to its low byte, or stray upper bits get OR-ed into the pattern | ||
32 | swap.b r5,r0 ! V0 | ||
33 | or r0,r5 ! VV | ||
34 | swap.w r5,r0 ! VV00 | ||
35 | or r0,r5 ! VVVV | ||
36 | ! | ||
37 | mov r6,r0 | ||
38 | shlr2 r0 | ||
39 | shlr r0 ! r0 = r6 >> 3 | ||
40 | 3: | ||
41 | dt r0 | ||
42 | mov.l r5,@-r4 ! set 8-byte at once | ||
43 | bf/s 3b | ||
44 | mov.l r5,@-r4 ! (delay slot) second long word of the pair | ||
45 | ! | ||
46 | mov #7,r0 | ||
47 | and r0,r6 ! r6 = bytes left over after the 8-byte passes | ||
48 | tst r6,r6 | ||
49 | bt 5f | ||
50 | ! fill bytes | ||
51 | 4: | ||
52 | dt r6 | ||
53 | bf/s 4b | ||
54 | mov.b r5,@-r4 ! (delay slot) one byte per pass | ||
55 | 5: | ||
56 | rts | ||
57 | mov r4,r0 ! (delay slot) r0 = r4, which has been walked back down to s | ||
diff --git a/arch/sh/lib/strcasecmp.c b/arch/sh/lib/strcasecmp.c new file mode 100644 index 000000000000..4e57a216feaf --- /dev/null +++ b/arch/sh/lib/strcasecmp.c | |||
@@ -0,0 +1,26 @@ | |||
1 | /* | ||
2 | * linux/arch/sh/lib/strcasecmp.c (originally linux/arch/alpha/lib/strcasecmp.c) | ||
3 | */ | ||
4 | |||
5 | #include <linux/string.h> | ||
6 | |||
7 | |||
8 | /* We handle nothing here except the C locale. Since this is used in | ||
9 | only one place, on strings known to contain only 7 bit ASCII, this | ||
10 | is ok. */ | ||
11 | |||
12 | int strcasecmp(const char *a, const char *b) | ||
13 | { | ||
14 | int x, y; | ||
15 | |||
16 | for (;;) { | ||
17 | x = *a++ & 0xff; /* next byte of each string, as a value in 0..255 */ | ||
18 | y = *b++ & 0xff; | ||
19 | x = (x >= 'A' && x <= 'Z') ? x + ('a' - 'A') : x; /* fold ASCII upper case to lower */ | ||
20 | y = (y >= 'A' && y <= 'Z') ? y + ('a' - 'A') : y; | ||
21 | if (x != y || x == '\0') /* first difference, or end of both strings */ | ||
22 | break; | ||
23 | } | ||
24 | |||
25 | return x - y; /* 0 when equal, otherwise the difference of the folded bytes */ | ||
26 | } | ||
diff --git a/arch/sh/lib/strlen.S b/arch/sh/lib/strlen.S new file mode 100644 index 000000000000..f8ab296047b3 --- /dev/null +++ b/arch/sh/lib/strlen.S | |||
@@ -0,0 +1,70 @@ | |||
1 | /* $Id: strlen.S,v 1.2 2001/06/29 14:07:15 gniibe Exp $ | ||
2 | * | ||
3 | * "strlen" implementation of SuperH | ||
4 | * | ||
5 | * Copyright (C) 1999 Kaz Kojima | ||
6 | * | ||
7 | */ | ||
8 | |||
9 | /* size_t strlen (const char *s) | ||
10 | * In: r4 = s. Tests single bytes up to a long word boundary, then scans one long word per pass. */ | ||
11 | #include <linux/linkage.h> | ||
12 | ENTRY(strlen) | ||
13 | mov r4,r0 | ||
14 | and #3,r0 ! r0 = s & 3 | ||
15 | tst r0,r0 | ||
16 | bt/s 1f ! already long word aligned | ||
17 | mov #0,r2 ! (delay slot) r2 = running length | ||
18 | |||
19 | add #-1,r0 | ||
20 | shll2 r0 | ||
21 | shll r0 ! r0 = ((s & 3) - 1) * 8 | ||
22 | braf r0 ! computed branch into the byte checks below: 3, 2 or 1 of them run | ||
23 | nop ! (delay slot) | ||
24 | |||
25 | mov.b @r4+,r1 | ||
26 | tst r1,r1 | ||
27 | bt 8f ! NUL found: r2 is the length | ||
28 | add #1,r2 | ||
29 | |||
30 | mov.b @r4+,r1 | ||
31 | tst r1,r1 | ||
32 | bt 8f | ||
33 | add #1,r2 | ||
34 | |||
35 | mov.b @r4+,r1 | ||
36 | tst r1,r1 | ||
37 | bt 8f | ||
38 | add #1,r2 | ||
39 | |||
40 | 1: | ||
41 | mov #0,r3 | ||
42 | 2: | ||
43 | mov.l @r4+,r1 | ||
44 | cmp/str r3,r1 ! T = some byte of r1 equals the same byte of r3 (all zero), i.e. r1 holds a NUL | ||
45 | bf/s 2b | ||
46 | add #4,r2 ! (delay slot) runs on every pass, including the last | ||
47 | |||
48 | add #-4,r2 ! take back the 4 counted for the long word that holds the NUL | ||
49 | #ifndef __LITTLE_ENDIAN__ | ||
50 | swap.b r1,r1 ! three swaps reverse the byte order, so the checks below are the same for both endians | ||
51 | swap.w r1,r1 | ||
52 | swap.b r1,r1 | ||
53 | #endif | ||
54 | extu.b r1,r0 ! test the lowest byte of r1 first | ||
55 | tst r0,r0 | ||
56 | bt/s 8f | ||
57 | shlr8 r1 ! (delay slot) move the next byte down | ||
58 | add #1,r2 | ||
59 | extu.b r1,r0 | ||
60 | tst r0,r0 | ||
61 | bt/s 8f | ||
62 | shlr8 r1 ! (delay slot) move the next byte down | ||
63 | add #1,r2 | ||
64 | extu.b r1,r0 | ||
65 | tst r0,r0 | ||
66 | bt 8f | ||
67 | add #1,r2 ! the first three bytes were non-zero, so the NUL is the fourth | ||
68 | 8: | ||
69 | rts | ||
70 | mov r2,r0 ! (delay slot) r0 = length | ||
diff --git a/arch/sh/lib/udivdi3.c b/arch/sh/lib/udivdi3.c new file mode 100644 index 000000000000..68f038bf3c50 --- /dev/null +++ b/arch/sh/lib/udivdi3.c | |||
@@ -0,0 +1,16 @@ | |||
1 | /* | ||
2 | * Simple __udivdi3 function which doesn't use FPU. | ||
3 | */ | ||
4 | |||
5 | #include <linux/types.h> | ||
6 | |||
7 | extern u64 __xdiv64_32(u64 n, u32 d); | ||
8 | extern void panic(const char * fmt, ...); | ||
9 | /* Forwards to __xdiv64_32() with the low 32 bits of d; panics if any of the upper 32 bits of d is set. */ | ||
10 | u64 __udivdi3(u64 n, u64 d) | ||
11 | { | ||
12 | if (d & ~0xffffffffULL) /* ULL is required: plain ~0xffffffff is a 32-bit unsigned 0, so the test could never fire */ | ||
13 | panic("Need true 64-bit/64-bit division"); | ||
14 | return __xdiv64_32(n, (u32)d); | ||
15 | } | ||
16 | |||