author    Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
commit    1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree      0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/sh/lib
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'arch/sh/lib')
-rw-r--r--  arch/sh/lib/Makefile         |  13
-rw-r--r--  arch/sh/lib/checksum.S       | 385
-rw-r--r--  arch/sh/lib/delay.c          |  41
-rw-r--r--  arch/sh/lib/div64-generic.c  |  19
-rw-r--r--  arch/sh/lib/div64.S          |  46
-rw-r--r--  arch/sh/lib/memchr.S         |  26
-rw-r--r--  arch/sh/lib/memcpy-sh4.S     | 800
-rw-r--r--  arch/sh/lib/memcpy.S         | 227
-rw-r--r--  arch/sh/lib/memmove.S        | 254
-rw-r--r--  arch/sh/lib/memset.S         |  57
-rw-r--r--  arch/sh/lib/strcasecmp.c     |  26
-rw-r--r--  arch/sh/lib/strlen.S         |  70
-rw-r--r--  arch/sh/lib/udivdi3.c        |  16
13 files changed, 1980 insertions, 0 deletions
diff --git a/arch/sh/lib/Makefile b/arch/sh/lib/Makefile
new file mode 100644
index 000000000000..b5681e3f9684
--- /dev/null
+++ b/arch/sh/lib/Makefile
@@ -0,0 +1,13 @@
#
# Makefile for SuperH-specific library files..
#

lib-y	= delay.o memset.o memmove.o memchr.o \
	  checksum.o strcasecmp.o strlen.o div64.o udivdi3.o \
	  div64-generic.o

memcpy-y			:= memcpy.o
memcpy-$(CONFIG_CPU_SH4)	:= memcpy-sh4.o

lib-y	+= $(memcpy-y)

diff --git a/arch/sh/lib/checksum.S b/arch/sh/lib/checksum.S
new file mode 100644
index 000000000000..7c50dfe68c07
--- /dev/null
+++ b/arch/sh/lib/checksum.S
@@ -0,0 +1,385 @@
1/* $Id: checksum.S,v 1.10 2001/07/06 13:11:32 gniibe Exp $
2 *
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * IP/TCP/UDP checksumming routines
8 *
9 * Authors: Jorge Cwik, <jorge@laser.satlink.net>
10 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
11 * Tom May, <ftom@netcom.com>
12 * Pentium Pro/II routines:
13 * Alexander Kjeldaas <astor@guardian.no>
14 * Finn Arne Gangstad <finnag@guardian.no>
15 * Lots of code moved from tcp.c and ip.c; see those files
16 * for more names.
17 *
18 * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
19 * handling.
20 * Andi Kleen, add zeroing on error
21 * converted to pure assembler
22 *
23 * SuperH version: Copyright (C) 1999 Niibe Yutaka
24 *
25 * This program is free software; you can redistribute it and/or
26 * modify it under the terms of the GNU General Public License
27 * as published by the Free Software Foundation; either version
28 * 2 of the License, or (at your option) any later version.
29 */
30
31#include <asm/errno.h>
32#include <linux/linkage.h>
33
34/*
35 * computes a partial checksum, e.g. for TCP/UDP fragments
36 */
37
38/*
39 * unsigned int csum_partial(const unsigned char *buf, int len,
40 * unsigned int sum);
41 */
42
43.text
44ENTRY(csum_partial)
45 /*
46 * Experiments with Ethernet and SLIP connections show that buff
47 * is aligned on either a 2-byte or 4-byte boundary. We get at
48 * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
49 * Fortunately, it is easy to convert 2-byte alignment to 4-byte
50 * alignment for the unrolled loop.
51 */
52 mov r5, r1
53 mov r4, r0
54 tst #2, r0 ! Check alignment.
55 bt 2f ! Jump if alignment is ok.
56 !
57 add #-2, r5 ! Alignment uses up two bytes.
58 cmp/pz r5 !
59 bt/s 1f ! Jump if we had at least two bytes.
60 clrt
61 bra 6f
62 add #2, r5 ! r5 was < 2. Deal with it.
631:
64 mov r5, r1 ! Save new len for later use.
65 mov.w @r4+, r0
66 extu.w r0, r0
67 addc r0, r6
68 bf 2f
69 add #1, r6
702:
71 mov #-5, r0
72 shld r0, r5
73 tst r5, r5
74 bt/s 4f ! if it's =0, go to 4f
75 clrt
76 .align 2
773:
78 mov.l @r4+, r0
79 mov.l @r4+, r2
80 mov.l @r4+, r3
81 addc r0, r6
82 mov.l @r4+, r0
83 addc r2, r6
84 mov.l @r4+, r2
85 addc r3, r6
86 mov.l @r4+, r3
87 addc r0, r6
88 mov.l @r4+, r0
89 addc r2, r6
90 mov.l @r4+, r2
91 addc r3, r6
92 addc r0, r6
93 addc r2, r6
94 movt r0
95 dt r5
96 bf/s 3b
97 cmp/eq #1, r0
98 ! here, we know r5==0
99 addc r5, r6 ! add carry to r6
1004:
101 mov r1, r0
102 and #0x1c, r0
103 tst r0, r0
104 bt/s 6f
105 mov r0, r5
106 shlr2 r5
107 mov #0, r2
1085:
109 addc r2, r6
110 mov.l @r4+, r2
111 movt r0
112 dt r5
113 bf/s 5b
114 cmp/eq #1, r0
115 addc r2, r6
116 addc r5, r6 ! r5==0 here, so it means add carry-bit
1176:
118 mov r1, r5
119 mov #3, r0
120 and r0, r5
121 tst r5, r5
122 bt 9f ! if it's =0 go to 9f
123 mov #2, r1
124 cmp/hs r1, r5
125 bf 7f
126 mov.w @r4+, r0
127 extu.w r0, r0
128 cmp/eq r1, r5
129 bt/s 8f
130 clrt
131 shll16 r0
132 addc r0, r6
1337:
134 mov.b @r4+, r0
135 extu.b r0, r0
136#ifndef __LITTLE_ENDIAN__
137 shll8 r0
138#endif
1398:
140 addc r0, r6
141 mov #0, r0
142 addc r0, r6
1439:
144 rts
145 mov r6, r0
146
147/*
148unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
149 int sum, int *src_err_ptr, int *dst_err_ptr)
150 */
151
152/*
153 * Copy from src while checksumming, otherwise like csum_partial
154 *
155 * The macros SRC and DST specify the type of access for the instruction,
156 * thus we can call a custom exception handler for all access types.
157 *
158 * FIXME: could someone double-check whether I haven't mixed up some SRC and
159 * DST definitions? It's damn hard to trigger all cases. I hope I got
160 * them all but there's no guarantee.
161 */
162
163#define SRC(...) \
164 9999: __VA_ARGS__ ; \
165 .section __ex_table, "a"; \
166 .long 9999b, 6001f ; \
167 .previous
168
169#define DST(...) \
170 9999: __VA_ARGS__ ; \
171 .section __ex_table, "a"; \
172 .long 9999b, 6002f ; \
173 .previous
174
175!
176! r4: const char *SRC
177! r5: char *DST
178! r6: int LEN
179! r7: int SUM
180!
181! on stack:
182! int *SRC_ERR_PTR
183! int *DST_ERR_PTR
184!
185ENTRY(csum_partial_copy_generic)
186 mov.l r5,@-r15
187 mov.l r6,@-r15
188
189 mov #3,r0 ! Check src and dest are equally aligned
190 mov r4,r1
191 and r0,r1
192 and r5,r0
193 cmp/eq r1,r0
194 bf 3f ! Different alignments, use slow version
195 tst #1,r0 ! Check dest word aligned
196 bf 3f ! If not, do it the slow way
197
198 mov #2,r0
199 tst r0,r5 ! Check dest alignment.
200 bt 2f ! Jump if alignment is ok.
201 add #-2,r6 ! Alignment uses up two bytes.
202 cmp/pz r6 ! Jump if we had at least two bytes.
203 bt/s 1f
204 clrt
205 bra 4f
206 add #2,r6 ! r6 was < 2. Deal with it.
207
2083: ! Handle different src and dest alignments.
209 ! This is not common, so simple byte by byte copy will do.
210 mov r6,r2
211 shlr r6
212 tst r6,r6
213 bt 4f
214 clrt
215 .align 2
2165:
217SRC( mov.b @r4+,r1 )
218SRC( mov.b @r4+,r0 )
219 extu.b r1,r1
220DST( mov.b r1,@r5 )
221DST( mov.b r0,@(1,r5) )
222 extu.b r0,r0
223 add #2,r5
224
225#ifdef __LITTLE_ENDIAN__
226 shll8 r0
227#else
228 shll8 r1
229#endif
230 or r1,r0
231
232 addc r0,r7
233 movt r0
234 dt r6
235 bf/s 5b
236 cmp/eq #1,r0
237 mov #0,r0
238 addc r0, r7
239
240 mov r2, r0
241 tst #1, r0
242 bt 7f
243 bra 5f
244 clrt
245
246 ! src and dest equally aligned, but to a two byte boundary.
247 ! Handle first two bytes as a special case
248 .align 2
2491:
250SRC( mov.w @r4+,r0 )
251DST( mov.w r0,@r5 )
252 add #2,r5
253 extu.w r0,r0
254 addc r0,r7
255 mov #0,r0
256 addc r0,r7
2572:
258 mov r6,r2
259 mov #-5,r0
260 shld r0,r6
261 tst r6,r6
262 bt/s 2f
263 clrt
264 .align 2
2651:
266SRC( mov.l @r4+,r0 )
267SRC( mov.l @r4+,r1 )
268 addc r0,r7
269DST( mov.l r0,@r5 )
270DST( mov.l r1,@(4,r5) )
271 addc r1,r7
272
273SRC( mov.l @r4+,r0 )
274SRC( mov.l @r4+,r1 )
275 addc r0,r7
276DST( mov.l r0,@(8,r5) )
277DST( mov.l r1,@(12,r5) )
278 addc r1,r7
279
280SRC( mov.l @r4+,r0 )
281SRC( mov.l @r4+,r1 )
282 addc r0,r7
283DST( mov.l r0,@(16,r5) )
284DST( mov.l r1,@(20,r5) )
285 addc r1,r7
286
287SRC( mov.l @r4+,r0 )
288SRC( mov.l @r4+,r1 )
289 addc r0,r7
290DST( mov.l r0,@(24,r5) )
291DST( mov.l r1,@(28,r5) )
292 addc r1,r7
293 add #32,r5
294 movt r0
295 dt r6
296 bf/s 1b
297 cmp/eq #1,r0
298 mov #0,r0
299 addc r0,r7
300
3012: mov r2,r6
302 mov #0x1c,r0
303 and r0,r6
304 cmp/pl r6
305 bf/s 4f
306 clrt
307 shlr2 r6
3083:
309SRC( mov.l @r4+,r0 )
310 addc r0,r7
311DST( mov.l r0,@r5 )
312 add #4,r5
313 movt r0
314 dt r6
315 bf/s 3b
316 cmp/eq #1,r0
317 mov #0,r0
318 addc r0,r7
3194: mov r2,r6
320 mov #3,r0
321 and r0,r6
322 cmp/pl r6
323 bf 7f
324 mov #2,r1
325 cmp/hs r1,r6
326 bf 5f
327SRC( mov.w @r4+,r0 )
328DST( mov.w r0,@r5 )
329 extu.w r0,r0
330 add #2,r5
331 cmp/eq r1,r6
332 bt/s 6f
333 clrt
334 shll16 r0
335 addc r0,r7
3365:
337SRC( mov.b @r4+,r0 )
338DST( mov.b r0,@r5 )
339 extu.b r0,r0
340#ifndef __LITTLE_ENDIAN__
341 shll8 r0
342#endif
3436: addc r0,r7
344 mov #0,r0
345 addc r0,r7
3467:
3475000:
348
349# Exception handler:
350.section .fixup, "ax"
351
3526001:
353 mov.l @(8,r15),r0 ! src_err_ptr
354 mov #-EFAULT,r1
355 mov.l r1,@r0
356
357 ! zero the complete destination - computing the rest
358 ! is too much work
359 mov.l @(4,r15),r5 ! dst
360 mov.l @r15,r6 ! len
361 mov #0,r7
3621: mov.b r7,@r5
363 dt r6
364 bf/s 1b
365 add #1,r5
366 mov.l 8000f,r0
367 jmp @r0
368 nop
369 .align 2
3708000: .long 5000b
371
3726002:
373 mov.l @(12,r15),r0 ! dst_err_ptr
374 mov #-EFAULT,r1
375 mov.l r1,@r0
376 mov.l 8001f,r0
377 jmp @r0
378 nop
379 .align 2
3808001: .long 5000b
381
382.previous
383 add #8,r15
384 rts
385 mov r7,r0
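
For reference, here is a hedged C sketch (not the kernel's code; the function name and the 64-bit accumulator are illustrative) of what csum_partial() above computes. The assembly folds carries as it goes with addc; the model below accumulates first and folds at the end, so the intermediate 32-bit value may differ, but the 16-bit folded checksum is the same. The trailing-byte placement mirrors the #ifdef __LITTLE_ENDIAN__ handling in the assembly.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t csum_partial_ref(const unsigned char *buf, size_t len,
				 uint32_t sum)
{
	uint64_t acc = sum;

	while (len > 1) {
		uint16_t w;

		memcpy(&w, buf, sizeof(w));	/* native-endian 16-bit word */
		acc += w;
		buf += 2;
		len -= 2;
	}
	if (len) {				/* odd trailing byte */
#ifdef __LITTLE_ENDIAN__
		acc += *buf;
#else
		acc += (uint32_t)*buf << 8;
#endif
	}
	while (acc >> 32)			/* fold carries back into 32 bits */
		acc = (acc & 0xffffffffu) + (acc >> 32);
	return (uint32_t)acc;
}
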
diff --git a/arch/sh/lib/delay.c b/arch/sh/lib/delay.c
new file mode 100644
index 000000000000..50b36037d86b
--- /dev/null
+++ b/arch/sh/lib/delay.c
@@ -0,0 +1,41 @@
/*
 * Precise Delay Loops for SuperH
 *
 * Copyright (C) 1999 Niibe Yutaka & Kaz Kojima
 */

#include <linux/sched.h>
#include <linux/delay.h>

void __delay(unsigned long loops)
{
	__asm__ __volatile__(
		"tst	%0, %0\n\t"
		"1:\t"
		"bf/s	1b\n\t"
		" dt	%0"
		: "=r" (loops)
		: "0" (loops)
		: "t");
}

inline void __const_udelay(unsigned long xloops)
{
	__asm__("dmulu.l	%0, %2\n\t"
		"sts	mach, %0"
		: "=r" (xloops)
		: "0" (xloops), "r" (cpu_data[_smp_processor_id()].loops_per_jiffy)
		: "macl", "mach");
	__delay(xloops * HZ);
}

void __udelay(unsigned long usecs)
{
	__const_udelay(usecs * 0x000010c6);  /* 2**32 / 1000000 */
}

void __ndelay(unsigned long nsecs)
{
	__const_udelay(nsecs * 0x00000005);
}
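
The scaling above works as follows: __udelay() pre-multiplies by 0x10c6 ≈ 2^32 / 10^6, __const_udelay() keeps the upper 32 bits of the 64-bit product with loops_per_jiffy (that is what the dmulu.l / sts mach pair does), and __delay() then spins for that count times HZ, i.e. roughly usecs * loops_per_jiffy * HZ / 10^6 iterations. A small illustrative C model (the helper name is made up, not kernel API):

#include <stdint.h>

/* Model of the usec -> spin-loop conversion done above.  Like the
 * originals, only valid for small delays (usecs * 0x10c6 must not
 * overflow 32 bits). */
static uint32_t udelay_loops_model(uint32_t usecs, uint32_t loops_per_jiffy,
				   uint32_t hz)
{
	uint32_t xloops = usecs * 0x10c6u;	/* ~ usecs * 2^32 / 10^6 */
	uint32_t high;

	/* dmulu.l %0,%2 ; sts mach,%0 : high 32 bits of the 64-bit product */
	high = (uint32_t)(((uint64_t)xloops * loops_per_jiffy) >> 32);

	return high * hz;			/* loop count handed to __delay() */
}
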
diff --git a/arch/sh/lib/div64-generic.c b/arch/sh/lib/div64-generic.c
new file mode 100644
index 000000000000..c02473afd581
--- /dev/null
+++ b/arch/sh/lib/div64-generic.c
@@ -0,0 +1,19 @@
/*
 * Generic __div64_32 wrapper for __xdiv64_32.
 */

#include <linux/types.h>

extern u64 __xdiv64_32(u64 n, u32 d);

u64 __div64_32(u64 *xp, u32 y)
{
	u64 rem;
	u64 q = __xdiv64_32(*xp, y);

	rem = *xp - q * y;
	*xp = q;

	return rem;
}
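
A minimal usage sketch of the convention above (the do_div()-style contract): the 64-bit dividend is overwritten with the quotient and the remainder is returned. The caller below is hypothetical and only for illustration; it uses stdint names in place of the kernel's u64/u32.

#include <stdint.h>

extern uint64_t __div64_32(uint64_t *xp, uint32_t y);	/* as defined above */

/* Hypothetical helper: split a nanosecond count into seconds
 * (left in *ns) and the leftover nanoseconds (returned). */
static uint32_t split_ns_model(uint64_t *ns)
{
	return (uint32_t)__div64_32(ns, 1000000000u);
}
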
diff --git a/arch/sh/lib/div64.S b/arch/sh/lib/div64.S
new file mode 100644
index 000000000000..eefc275d64a7
--- /dev/null
+++ b/arch/sh/lib/div64.S
@@ -0,0 +1,46 @@
1/*
2 * unsigned long long __xdiv64_32(unsigned long long n, unsigned long d);
3 */
4
5#include <linux/linkage.h>
6
7.text
8ENTRY(__xdiv64_32)
9#ifdef __LITTLE_ENDIAN__
10 mov r4, r0
11 mov r5, r1
12#else
13 mov r4, r1
14 mov r5, r0
15#endif
16 cmp/hs r6, r1
17 bf.s 1f
18 mov #0, r2
19
20 mov r1, r2
21 mov #0, r3
22 div0u
23 .rept 32
24 rotcl r2
25 div1 r6, r3
26 .endr
27 rotcl r2
28 mul.l r6, r2
29 sts macl, r3
30 sub r3, r1
311:
32 div0u
33 .rept 32
34 rotcl r0
35 div1 r6, r1
36 .endr
37#ifdef __LITTLE_ENDIAN__
38 mov r2, r1
39 rts
40 rotcl r0
41#else
42 rotcl r0
43 mov r0, r1
44 rts
45 mov r2, r0
46#endif
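
The routine above can be read as two chained 32-bit-wide long divisions: the high word is divided first (only when it is at least as large as the divisor, the cmp/hs test), and its remainder is prepended to the low word for the second div0u/div1 sequence. A hedged C model of that algorithm (illustrative, not the kernel code; behaviour for d == 0 is undefined, as in the assembly):

#include <stdint.h>

static uint64_t xdiv64_32_model(uint64_t n, uint32_t d)
{
	uint32_t hi = (uint32_t)(n >> 32);
	uint32_t lo = (uint32_t)n;
	uint32_t q_hi = 0, rem = hi;

	if (hi >= d) {			/* the cmp/hs r6,r1 test */
		q_hi = hi / d;
		rem = hi % d;
	}
	/* rem < d, so (rem:lo) / d fits in 32 bits */
	return ((uint64_t)q_hi << 32) |
	       (uint32_t)((((uint64_t)rem << 32) | lo) / d);
}
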
diff --git a/arch/sh/lib/memchr.S b/arch/sh/lib/memchr.S
new file mode 100644
index 000000000000..bc6036ad5706
--- /dev/null
+++ b/arch/sh/lib/memchr.S
@@ -0,0 +1,26 @@
1/* $Id: memchr.S,v 1.1 2000/04/14 16:49:01 mjd Exp $
2 *
3 * "memchr" implementation of SuperH
4 *
5 * Copyright (C) 1999 Niibe Yutaka
6 *
7 */
8
9/*
10 * void *memchr(const void *s, int c, size_t n);
11 */
12
13#include <linux/linkage.h>
14ENTRY(memchr)
15 tst r6,r6
16 bt/s 2f
17 exts.b r5,r5
181: mov.b @r4,r1
19 cmp/eq r1,r5
20 bt/s 3f
21 dt r6
22 bf/s 1b
23 add #1,r4
242: mov #0,r4
253: rts
26 mov r4,r0
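
For comparison, a plain C rendering of the scan above (behaviourally the standard memchr(); the sign extension of c via exts.b does not affect the byte-equality test). Illustrative only:

#include <stddef.h>

static void *memchr_model(const void *s, int c, size_t n)
{
	const unsigned char *p = s;
	unsigned char ch = (unsigned char)c;

	while (n--) {
		if (*p == ch)
			return (void *)p;	/* the asm returns r4 at the match */
		p++;
	}
	return NULL;				/* the asm returns 0 */
}
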
diff --git a/arch/sh/lib/memcpy-sh4.S b/arch/sh/lib/memcpy-sh4.S
new file mode 100644
index 000000000000..55f227441f9e
--- /dev/null
+++ b/arch/sh/lib/memcpy-sh4.S
@@ -0,0 +1,800 @@
1/*
2 * "memcpy" implementation of SuperH
3 *
4 * Copyright (C) 1999 Niibe Yutaka
5 * Copyright (c) 2002 STMicroelectronics Ltd
6 * Modified from memcpy.S and micro-optimised for SH4
7 * Stuart Menefy (stuart.menefy@st.com)
8 *
9 */
10#include <linux/linkage.h>
11#include <linux/config.h>
12
13/*
14 * void *memcpy(void *dst, const void *src, size_t n);
15 *
16 * It is assumed that there is no overlap between src and dst.
17 * If there is an overlap, then the results are undefined.
18 */
19
20 !
21 ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR.
22 !
23
24 ! Size is 16 or greater, and may have trailing bytes
25
26 .balign 32
27.Lcase1:
28 ! Read a long word and write a long word at once
29 ! At the start of each iteration, r7 contains last long load
30 add #-1,r5 ! 79 EX
31 mov r4,r2 ! 5 MT (0 cycles latency)
32
33 mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency)
34 add #-4,r5 ! 50 EX
35
36 add #7,r2 ! 79 EX
37 !
38#ifdef CONFIG_CPU_LITTLE_ENDIAN
39 ! 6 cycles, 4 bytes per iteration
403: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
41 mov r7, r3 ! 5 MT (latency=0) ! RQPO
42
43 cmp/hi r2,r0 ! 57 MT
44 shll16 r3 ! 103 EX
45
46 mov r1,r6 ! 5 MT (latency=0)
47 shll8 r3 ! 102 EX ! Oxxx
48
49 shlr8 r6 ! 106 EX ! xNML
50 mov r1, r7 ! 5 MT (latency=0)
51
52 or r6,r3 ! 82 EX ! ONML
53 bt/s 3b ! 109 BR
54
55 mov.l r3,@-r0 ! 30 LS
56#else
573: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN
58 mov r7,r3 ! 5 MT (latency=0) ! OPQR
59
60 cmp/hi r2,r0 ! 57 MT
61 shlr16 r3 ! 107 EX
62
63 shlr8 r3 ! 106 EX ! xxxO
64 mov r1,r6 ! 5 MT (latency=0)
65
66 shll8 r6 ! 102 EX ! LMNx
67 mov r1,r7 ! 5 MT (latency=0)
68
69 or r6,r3 ! 82 EX ! LMNO
70 bt/s 3b ! 109 BR
71
72 mov.l r3,@-r0 ! 30 LS
73#endif
74 ! Finally, copy a byte at once, if necessary
75
76 add #4,r5 ! 50 EX
77 cmp/eq r4,r0 ! 54 MT
78
79 add #-6,r2 ! 50 EX
80 bt 9f ! 109 BR
81
828: cmp/hi r2,r0 ! 57 MT
83 mov.b @(r0,r5),r1 ! 20 LS (latency=2)
84
85 bt/s 8b ! 109 BR
86
87 mov.b r1,@-r0 ! 29 LS
88
899: rts
90 nop
91
92
93 !
94 ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R...
95 !
96
97 ! Size is 16 or greater, and may have trailing bytes
98
99 .balign 32
100.Lcase3:
101 ! Read a long word and write a long word at once
102 ! At the start of each iteration, r7 contains last long load
103 add #-3,r5 ! 79 EX
104 mov r4,r2 ! 5 MT (0 cycles latency)
105
106 mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency)
107 add #-4,r5 ! 50 EX
108
109 add #7,r2 ! 79 EX
110 !
111#ifdef CONFIG_CPU_LITTLE_ENDIAN
112 ! 6 cycles, 4 bytes per iteration
1133: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
114 mov r7, r3 ! 5 MT (latency=0) ! RQPO
115
116 cmp/hi r2,r0 ! 57 MT
117 shll8 r3 ! 102 EX ! QPOx
118
119 mov r1,r6 ! 5 MT (latency=0)
120 shlr16 r6 ! 107 EX
121
122 shlr8 r6 ! 106 EX ! xxxN
123 mov r1, r7 ! 5 MT (latency=0)
124
125 or r6,r3 ! 82 EX ! QPON
126 bt/s 3b ! 109 BR
127
128 mov.l r3,@-r0 ! 30 LS
129#else
1303: mov r1,r3 ! OPQR
131 shlr8 r3 ! xOPQ
132 mov.l @(r0,r5),r1 ! KLMN
133 mov r1,r6
134 shll16 r6
135 shll8 r6 ! Nxxx
136 or r6,r3 ! NOPQ
137 cmp/hi r2,r0
138 bt/s 3b
139 mov.l r3,@-r0
140#endif
141
142 ! Finally, copy a byte at once, if necessary
143
144 add #6,r5 ! 50 EX
145 cmp/eq r4,r0 ! 54 MT
146
147 add #-6,r2 ! 50 EX
148 bt 9f ! 109 BR
149
1508: cmp/hi r2,r0 ! 57 MT
151 mov.b @(r0,r5),r1 ! 20 LS (latency=2)
152
153 bt/s 8b ! 109 BR
154
155 mov.b r1,@-r0 ! 29 LS
156
1579: rts
158 nop
159
160ENTRY(memcpy)
161
162 ! Calculate the invariants which will be used in the remainder
163 ! of the code:
164 !
165 ! r4 --> [ ... ] DST [ ... ] SRC
166 ! [ ... ] [ ... ]
167 ! : :
168 ! r0 --> [ ... ] r0+r5 --> [ ... ]
169 !
170 !
171
172 ! Short circuit the common case of src, dst and len being 32 bit aligned
173 ! and test for zero length move
174
175 mov r6, r0 ! 5 MT (0 cycle latency)
176 or r4, r0 ! 82 EX
177
178 or r5, r0 ! 82 EX
179 tst r6, r6 ! 86 MT
180
181 bt/s 99f ! 111 BR (zero len)
182 tst #3, r0 ! 87 MT
183
184 mov r4, r0 ! 5 MT (0 cycle latency)
185 add r6, r0 ! 49 EX
186
187 mov #16, r1 ! 6 EX
188 bt/s .Lcase00 ! 111 BR (aligned)
189
190 sub r4, r5 ! 75 EX
191
192 ! Arguments are not nicely long word aligned or zero len.
193 ! Check for small copies, and if so do a simple byte at a time copy.
194 !
195 ! Deciding on an exact value of 'small' is not easy, as the point at which
196	! using the optimised routines becomes worthwhile varies (these are the
197	! cycle counts for different sizes using byte-at-a-time vs. optimised):
198 ! size byte-at-time long word byte
199 ! 16 42 39-40 46-50 50-55
200 ! 24 58 43-44 54-58 62-67
201 ! 36 82 49-50 66-70 80-85
202 ! However the penalty for getting it 'wrong' is much higher for long word
203 ! aligned data (and this is more common), so use a value of 16.
204
205 cmp/gt r6,r1 ! 56 MT
206
207 add #-1,r5 ! 50 EX
208 bf/s 6f ! 108 BR (not small)
209
210 mov r5, r3 ! 5 MT (latency=0)
211 shlr r6 ! 104 EX
212
213 mov.b @(r0,r5),r1 ! 20 LS (latency=2)
214 bf/s 4f ! 111 BR
215
216 add #-1,r3 ! 50 EX
217 tst r6, r6 ! 86 MT
218
219 bt/s 98f ! 110 BR
220 mov.b r1,@-r0 ! 29 LS
221
222 ! 4 cycles, 2 bytes per iteration
2233: mov.b @(r0,r5),r1 ! 20 LS (latency=2)
224
2254: mov.b @(r0,r3),r2 ! 20 LS (latency=2)
226 dt r6 ! 67 EX
227
228 mov.b r1,@-r0 ! 29 LS
229 bf/s 3b ! 111 BR
230
231 mov.b r2,@-r0 ! 29 LS
23298:
233 rts
234 nop
235
23699: rts
237 mov r4, r0
238
239	! Size is not small, so it's worthwhile looking for optimisations.
240 ! First align destination to a long word boundary.
241 !
242 ! r5 = normal value -1
243
2446: tst #3, r0 ! 87 MT
245 mov #3, r3 ! 6 EX
246
247 bt/s 2f ! 111 BR
248 and r0,r3 ! 78 EX
249
250 ! 3 cycles, 1 byte per iteration
2511: dt r3 ! 67 EX
252 mov.b @(r0,r5),r1 ! 19 LS (latency=2)
253
254 add #-1, r6 ! 79 EX
255 bf/s 1b ! 109 BR
256
257 mov.b r1,@-r0 ! 28 LS
258
2592: add #1, r5 ! 79 EX
260
261 ! Now select the appropriate bulk transfer code based on relative
262 ! alignment of src and dst.
263
264 mov r0, r3 ! 5 MT (latency=0)
265
266 mov r5, r0 ! 5 MT (latency=0)
267 tst #1, r0 ! 87 MT
268
269 bf/s 1f ! 111 BR
270 mov #64, r7 ! 6 EX
271
272 ! bit 0 clear
273
274 cmp/ge r7, r6 ! 55 MT
275
276 bt/s 2f ! 111 BR
277 tst #2, r0 ! 87 MT
278
279 ! small
280 bt/s .Lcase0
281 mov r3, r0
282
283 bra .Lcase2
284 nop
285
286 ! big
2872: bt/s .Lcase0b
288 mov r3, r0
289
290 bra .Lcase2b
291 nop
292
293 ! bit 0 set
2941: tst #2, r0 ! 87 MT
295
296 bt/s .Lcase1
297 mov r3, r0
298
299 bra .Lcase3
300 nop
301
302
303 !
304 ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR
305 !
306
307 ! src, dst and size are all long word aligned
308 ! size is non-zero
309
310 .balign 32
311.Lcase00:
312 mov #64, r1 ! 6 EX
313 mov r5, r3 ! 5 MT (latency=0)
314
315 cmp/gt r6, r1 ! 56 MT
316 add #-4, r5 ! 50 EX
317
318 bf .Lcase00b ! 108 BR (big loop)
319 shlr2 r6 ! 105 EX
320
321 shlr r6 ! 104 EX
322 mov.l @(r0, r5), r1 ! 21 LS (latency=2)
323
324 bf/s 4f ! 111 BR
325 add #-8, r3 ! 50 EX
326
327 tst r6, r6 ! 86 MT
328 bt/s 5f ! 110 BR
329
330 mov.l r1,@-r0 ! 30 LS
331
332 ! 4 cycles, 2 long words per iteration
3333: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
334
3354: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
336 dt r6 ! 67 EX
337
338 mov.l r1, @-r0 ! 30 LS
339 bf/s 3b ! 109 BR
340
341 mov.l r2, @-r0 ! 30 LS
342
3435: rts
344 nop
345
346
347 ! Size is 16 or greater and less than 64, but may have trailing bytes
348
349 .balign 32
350.Lcase0:
351 add #-4, r5 ! 50 EX
352 mov r4, r7 ! 5 MT (latency=0)
353
354 mov.l @(r0, r5), r1 ! 21 LS (latency=2)
355 mov #4, r2 ! 6 EX
356
357 add #11, r7 ! 50 EX
358 tst r2, r6 ! 86 MT
359
360 mov r5, r3 ! 5 MT (latency=0)
361 bt/s 4f ! 111 BR
362
363 add #-4, r3 ! 50 EX
364 mov.l r1,@-r0 ! 30 LS
365
366 ! 4 cycles, 2 long words per iteration
3673: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
368
3694: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
370 cmp/hi r7, r0
371
372 mov.l r1, @-r0 ! 30 LS
373 bt/s 3b ! 109 BR
374
375 mov.l r2, @-r0 ! 30 LS
376
377 ! Copy the final 0-3 bytes
378
379 add #3,r5 ! 50 EX
380
381 cmp/eq r0, r4 ! 54 MT
382 add #-10, r7 ! 50 EX
383
384 bt 9f ! 110 BR
385
386 ! 3 cycles, 1 byte per iteration
3871: mov.b @(r0,r5),r1 ! 19 LS
388 cmp/hi r7,r0 ! 57 MT
389
390 bt/s 1b ! 111 BR
391 mov.b r1,@-r0 ! 28 LS
392
3939: rts
394 nop
395
396 ! Size is at least 64 bytes, so will be going round the big loop at least once.
397 !
398 ! r2 = rounded up r4
399 ! r3 = rounded down r0
400
401 .balign 32
402.Lcase0b:
403 add #-4, r5 ! 50 EX
404
405.Lcase00b:
406 mov r0, r3 ! 5 MT (latency=0)
407 mov #(~0x1f), r1 ! 6 EX
408
409 and r1, r3 ! 78 EX
410 mov r4, r2 ! 5 MT (latency=0)
411
412 cmp/eq r3, r0 ! 54 MT
413 add #0x1f, r2 ! 50 EX
414
415 bt/s 1f ! 110 BR
416 and r1, r2 ! 78 EX
417
418 ! copy initial words until cache line aligned
419
420 mov.l @(r0, r5), r1 ! 21 LS (latency=2)
421 tst #4, r0 ! 87 MT
422
423 mov r5, r6 ! 5 MT (latency=0)
424 add #-4, r6 ! 50 EX
425
426 bt/s 4f ! 111 BR
427 add #8, r3 ! 50 EX
428
429 tst #0x18, r0 ! 87 MT
430
431 bt/s 1f ! 109 BR
432 mov.l r1,@-r0 ! 30 LS
433
434 ! 4 cycles, 2 long words per iteration
4353: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
436
4374: mov.l @(r0, r6), r7 ! 21 LS (latency=2)
438 cmp/eq r3, r0 ! 54 MT
439
440 mov.l r1, @-r0 ! 30 LS
441 bf/s 3b ! 109 BR
442
443 mov.l r7, @-r0 ! 30 LS
444
445 ! Copy the cache line aligned blocks
446 !
447 ! In use: r0, r2, r4, r5
448 ! Scratch: r1, r3, r6, r7
449 !
450 ! We could do this with the four scratch registers, but if src
451 ! and dest hit the same cache line, this will thrash, so make
452 ! use of additional registers.
453 !
454 ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
455 ! r5: src (was r0+r5)
456 ! r1: dest (was r0)
457 ! this can be reversed at the end, so we don't need to save any extra
458 ! state.
459 !
4601: mov.l r8, @-r15 ! 30 LS
461 add r0, r5 ! 49 EX
462
463 mov.l r9, @-r15 ! 30 LS
464 mov r0, r1 ! 5 MT (latency=0)
465
466 mov.l r10, @-r15 ! 30 LS
467 add #-0x1c, r5 ! 50 EX
468
469 mov.l r11, @-r15 ! 30 LS
470
471 ! 16 cycles, 32 bytes per iteration
4722: mov.l @(0x00,r5),r0 ! 18 LS (latency=2)
473 add #-0x20, r1 ! 50 EX
474 mov.l @(0x04,r5),r3 ! 18 LS (latency=2)
475 mov.l @(0x08,r5),r6 ! 18 LS (latency=2)
476 mov.l @(0x0c,r5),r7 ! 18 LS (latency=2)
477 mov.l @(0x10,r5),r8 ! 18 LS (latency=2)
478 mov.l @(0x14,r5),r9 ! 18 LS (latency=2)
479 mov.l @(0x18,r5),r10 ! 18 LS (latency=2)
480 mov.l @(0x1c,r5),r11 ! 18 LS (latency=2)
481 movca.l r0,@r1 ! 40 LS (latency=3-7)
482 mov.l r3,@(0x04,r1) ! 33 LS
483 mov.l r6,@(0x08,r1) ! 33 LS
484 mov.l r7,@(0x0c,r1) ! 33 LS
485
486 mov.l r8,@(0x10,r1) ! 33 LS
487 add #-0x20, r5 ! 50 EX
488
489 mov.l r9,@(0x14,r1) ! 33 LS
490 cmp/eq r2,r1 ! 54 MT
491
492 mov.l r10,@(0x18,r1) ! 33 LS
493 bf/s 2b ! 109 BR
494
495 mov.l r11,@(0x1c,r1) ! 33 LS
496
497 mov r1, r0 ! 5 MT (latency=0)
498
499 mov.l @r15+, r11 ! 15 LS
500 sub r1, r5 ! 75 EX
501
502 mov.l @r15+, r10 ! 15 LS
503 cmp/eq r4, r0 ! 54 MT
504
505 bf/s 1f ! 109 BR
506 mov.l @r15+, r9 ! 15 LS
507
508 rts
5091: mov.l @r15+, r8 ! 15 LS
510 sub r4, r1 ! 75 EX (len remaining)
511
512 ! number of trailing bytes is non-zero
513 !
514 ! invariants restored (r5 already decremented by 4)
515 ! also r1=num bytes remaining
516
517 mov #4, r2 ! 6 EX
518 mov r4, r7 ! 5 MT (latency=0)
519
520 add #0x1c, r5 ! 50 EX (back to -4)
521 cmp/hs r2, r1 ! 58 MT
522
523 bf/s 5f ! 108 BR
524 add #11, r7 ! 50 EX
525
526 mov.l @(r0, r5), r6 ! 21 LS (latency=2)
527 tst r2, r1 ! 86 MT
528
529 mov r5, r3 ! 5 MT (latency=0)
530 bt/s 4f ! 111 BR
531
532 add #-4, r3 ! 50 EX
533 cmp/hs r2, r1 ! 58 MT
534
535 bt/s 5f ! 111 BR
536 mov.l r6,@-r0 ! 30 LS
537
538 ! 4 cycles, 2 long words per iteration
5393: mov.l @(r0, r5), r6 ! 21 LS (latency=2)
540
5414: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
542 cmp/hi r7, r0
543
544 mov.l r6, @-r0 ! 30 LS
545 bt/s 3b ! 109 BR
546
547 mov.l r2, @-r0 ! 30 LS
548
549 ! Copy the final 0-3 bytes
550
5515: cmp/eq r0, r4 ! 54 MT
552 add #-10, r7 ! 50 EX
553
554 bt 9f ! 110 BR
555 add #3,r5 ! 50 EX
556
557 ! 3 cycles, 1 byte per iteration
5581: mov.b @(r0,r5),r1 ! 19 LS
559 cmp/hi r7,r0 ! 57 MT
560
561 bt/s 1b ! 111 BR
562 mov.b r1,@-r0 ! 28 LS
563
5649: rts
565 nop
566
567 !
568 ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR..
569 !
570
571 .balign 32
572.Lcase2:
573	! Size is 16 or greater and less than 64, but may have trailing bytes
574
5752: mov r5, r6 ! 5 MT (latency=0)
576 add #-2,r5 ! 50 EX
577
578 mov r4,r2 ! 5 MT (latency=0)
579 add #-4,r6 ! 50 EX
580
581 add #7,r2 ! 50 EX
5823: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
583
584 mov.w @(r0,r6),r3 ! 20 LS (latency=2)
585 cmp/hi r2,r0 ! 57 MT
586
587 mov.w r1,@-r0 ! 29 LS
588 bt/s 3b ! 111 BR
589
590 mov.w r3,@-r0 ! 29 LS
591
592 bra 10f
593 nop
594
595
596 .balign 32
597.Lcase2b:
598 ! Size is at least 64 bytes, so will be going round the big loop at least once.
599 !
600 ! r2 = rounded up r4
601 ! r3 = rounded down r0
602
603 mov r0, r3 ! 5 MT (latency=0)
604 mov #(~0x1f), r1 ! 6 EX
605
606 and r1, r3 ! 78 EX
607 mov r4, r2 ! 5 MT (latency=0)
608
609 cmp/eq r3, r0 ! 54 MT
610 add #0x1f, r2 ! 50 EX
611
612 add #-2, r5 ! 50 EX
613 bt/s 1f ! 110 BR
614 and r1, r2 ! 78 EX
615
616 ! Copy a short word one at a time until we are cache line aligned
617 ! Normal values: r0, r2, r3, r4
618 ! Unused: r1, r6, r7
619 ! Mod: r5 (=r5-2)
620 !
621 add #2, r3 ! 50 EX
622
6232: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
624 cmp/eq r3,r0 ! 54 MT
625
626 bf/s 2b ! 111 BR
627
628 mov.w r1,@-r0 ! 29 LS
629
630 ! Copy the cache line aligned blocks
631 !
632 ! In use: r0, r2, r4, r5 (=r5-2)
633 ! Scratch: r1, r3, r6, r7
634 !
635 ! We could do this with the four scratch registers, but if src
636 ! and dest hit the same cache line, this will thrash, so make
637 ! use of additional registers.
638 !
639 ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
640 ! r5: src (was r0+r5)
641 ! r1: dest (was r0)
642 ! this can be reversed at the end, so we don't need to save any extra
643 ! state.
644 !
6451: mov.l r8, @-r15 ! 30 LS
646 add r0, r5 ! 49 EX
647
648 mov.l r9, @-r15 ! 30 LS
649 mov r0, r1 ! 5 MT (latency=0)
650
651 mov.l r10, @-r15 ! 30 LS
652 add #-0x1e, r5 ! 50 EX
653
654 mov.l r11, @-r15 ! 30 LS
655
656 mov.l r12, @-r15 ! 30 LS
657
658 ! 17 cycles, 32 bytes per iteration
659#ifdef CONFIG_CPU_LITTLE_ENDIAN
6602: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI
661 add #-0x20, r1 ! 50 EX
662
663 mov.l @r5+, r3 ! 15 LS (latency=2) NMLK
664
665 mov.l @r5+, r6 ! 15 LS (latency=2) RQPO
666 shll16 r0 ! 103 EX JI..
667
668 mov.l @r5+, r7 ! 15 LS (latency=2)
669 xtrct r3, r0 ! 48 EX LKJI
670
671 mov.l @r5+, r8 ! 15 LS (latency=2)
672 xtrct r6, r3 ! 48 EX PONM
673
674 mov.l @r5+, r9 ! 15 LS (latency=2)
675 xtrct r7, r6 ! 48 EX
676
677 mov.l @r5+, r10 ! 15 LS (latency=2)
678 xtrct r8, r7 ! 48 EX
679
680 mov.l @r5+, r11 ! 15 LS (latency=2)
681 xtrct r9, r8 ! 48 EX
682
683 mov.w @r5+, r12 ! 15 LS (latency=2)
684 xtrct r10, r9 ! 48 EX
685
686 movca.l r0,@r1 ! 40 LS (latency=3-7)
687 xtrct r11, r10 ! 48 EX
688
689 mov.l r3, @(0x04,r1) ! 33 LS
690 xtrct r12, r11 ! 48 EX
691
692 mov.l r6, @(0x08,r1) ! 33 LS
693
694 mov.l r7, @(0x0c,r1) ! 33 LS
695
696 mov.l r8, @(0x10,r1) ! 33 LS
697 add #-0x40, r5 ! 50 EX
698
699 mov.l r9, @(0x14,r1) ! 33 LS
700 cmp/eq r2,r1 ! 54 MT
701
702 mov.l r10, @(0x18,r1) ! 33 LS
703 bf/s 2b ! 109 BR
704
705 mov.l r11, @(0x1c,r1) ! 33 LS
706#else
7072: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2)
708 add #-2, r5 ! 50 EX
709
710 mov.l @(0x1c,r5), r3 ! 18 LS (latency=2)
711 add #-4, r1 ! 50 EX
712
713 mov.l @(0x18,r5), r6 ! 18 LS (latency=2)
714 shll16 r0 ! 103 EX
715
716 mov.l @(0x14,r5), r7 ! 18 LS (latency=2)
717 xtrct r3, r0 ! 48 EX
718
719 mov.l @(0x10,r5), r8 ! 18 LS (latency=2)
720 xtrct r6, r3 ! 48 EX
721
722 mov.l @(0x0c,r5), r9 ! 18 LS (latency=2)
723 xtrct r7, r6 ! 48 EX
724
725 mov.l @(0x08,r5), r10 ! 18 LS (latency=2)
726 xtrct r8, r7 ! 48 EX
727
728 mov.l @(0x04,r5), r11 ! 18 LS (latency=2)
729 xtrct r9, r8 ! 48 EX
730
731 mov.w @(0x02,r5), r12 ! 18 LS (latency=2)
732 xtrct r10, r9 ! 48 EX
733
734 movca.l r0,@r1 ! 40 LS (latency=3-7)
735 add #-0x1c, r1 ! 50 EX
736
737 mov.l r3, @(0x1c,r1) ! 33 LS
738 xtrct r11, r10 ! 48 EX
739
740 mov.l r6, @(0x18,r1) ! 33 LS
741 xtrct r12, r11 ! 48 EX
742
743 mov.l r7, @(0x14,r1) ! 33 LS
744
745 mov.l r8, @(0x10,r1) ! 33 LS
746 add #-0x3e, r5 ! 50 EX
747
748 mov.l r9, @(0x0c,r1) ! 33 LS
749 cmp/eq r2,r1 ! 54 MT
750
751 mov.l r10, @(0x08,r1) ! 33 LS
752 bf/s 2b ! 109 BR
753
754 mov.l r11, @(0x04,r1) ! 33 LS
755#endif
756
757 mov.l @r15+, r12
758 mov r1, r0 ! 5 MT (latency=0)
759
760 mov.l @r15+, r11 ! 15 LS
761 sub r1, r5 ! 75 EX
762
763 mov.l @r15+, r10 ! 15 LS
764 cmp/eq r4, r0 ! 54 MT
765
766 bf/s 1f ! 109 BR
767 mov.l @r15+, r9 ! 15 LS
768
769 rts
7701: mov.l @r15+, r8 ! 15 LS
771
772 add #0x1e, r5 ! 50 EX
773
774 ! Finish off a short word at a time
775 ! r5 must be invariant - 2
77610: mov r4,r2 ! 5 MT (latency=0)
777 add #1,r2 ! 50 EX
778
779 cmp/hi r2, r0 ! 57 MT
780 bf/s 1f ! 109 BR
781
782 add #2, r2 ! 50 EX
783
7843: mov.w @(r0,r5),r1 ! 20 LS
785 cmp/hi r2,r0 ! 57 MT
786
787 bt/s 3b ! 109 BR
788
789 mov.w r1,@-r0 ! 29 LS
7901:
791
792 !
793 ! Finally, copy the last byte if necessary
794 cmp/eq r4,r0 ! 54 MT
795 bt/s 9b
796 add #1,r5
797 mov.b @(r0,r5),r1
798 rts
799 mov.b r1,@-r0
800
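
Both the .Lcase1 and .Lcase3 loops above (and their memcpy.S counterparts) rely on the same idea: issue only aligned long-word loads and stores, and rebuild each misaligned source word by shifting and OR-ing two neighbouring aligned words (the shll8/shlr8/shll16 and xtrct sequences). A hedged little-endian sketch of that combine step; the function name and parameters are illustrative, and the big-endian path mirrors the shifts just as the #ifdefs above do:

#include <stdint.h>

/* Combine two aligned 32-bit loads into the value a single unaligned
 * load at byte offset 'offset' (1..3) would have returned,
 * little-endian layout assumed. */
static uint32_t combine_unaligned_le(uint32_t low_word, uint32_t high_word,
				     unsigned int offset)
{
	unsigned int s = 8 * offset;		/* 8, 16 or 24 only */

	return (low_word >> s) | (high_word << (32 - s));
}
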
diff --git a/arch/sh/lib/memcpy.S b/arch/sh/lib/memcpy.S
new file mode 100644
index 000000000000..232fab34c261
--- /dev/null
+++ b/arch/sh/lib/memcpy.S
@@ -0,0 +1,227 @@
1/* $Id: memcpy.S,v 1.3 2001/07/27 11:50:52 gniibe Exp $
2 *
3 * "memcpy" implementation of SuperH
4 *
5 * Copyright (C) 1999 Niibe Yutaka
6 *
7 */
8
9/*
10 * void *memcpy(void *dst, const void *src, size_t n);
11 * No overlap between the memory of DST and that of SRC is assumed.
12 */
13
14#include <linux/linkage.h>
15ENTRY(memcpy)
16 tst r6,r6
17 bt/s 9f ! if n=0, do nothing
18 mov r4,r0
19 sub r4,r5 ! From here, r5 has the distance to r0
20 add r6,r0 ! From here, r0 points the end of copying point
21 mov #12,r1
22 cmp/gt r6,r1
23 bt/s 7f ! if it's too small, copy a byte at once
24 add #-1,r5
25 add #1,r5
26 ! From here, r6 is free
27 !
28 ! r4 --> [ ... ] DST [ ... ] SRC
29 ! [ ... ] [ ... ]
30 ! : :
31 ! r0 --> [ ... ] r0+r5 --> [ ... ]
32 !
33 !
34 mov r5,r1
35 mov #3,r2
36 and r2,r1
37 shll2 r1
38 mov r0,r3 ! Save the value on R0 to R3
39 mova jmptable,r0
40 add r1,r0
41 mov.l @r0,r1
42 jmp @r1
43 mov r3,r0 ! and back to R0
44 .balign 4
45jmptable:
46 .long case0
47 .long case1
48 .long case2
49 .long case3
50
51 ! copy a byte at once
527: mov r4,r2
53 add #1,r2
548:
55 cmp/hi r2,r0
56 mov.b @(r0,r5),r1
57 bt/s 8b ! while (r0>r2)
58 mov.b r1,@-r0
599:
60 rts
61 nop
62
63case0:
64 !
65 ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR
66 !
67 ! First, align to long word boundary
68 mov r0,r3
69 and r2,r3
70 tst r3,r3
71 bt/s 2f
72 add #-4,r5
73 add #3,r5
741: dt r3
75 mov.b @(r0,r5),r1
76 bf/s 1b
77 mov.b r1,@-r0
78 !
79 add #-3,r5
802: ! Second, copy a long word at once
81 mov r4,r2
82 add #7,r2
833: mov.l @(r0,r5),r1
84 cmp/hi r2,r0
85 bt/s 3b
86 mov.l r1,@-r0
87 !
88 ! Third, copy a byte at once, if necessary
89 cmp/eq r4,r0
90 bt/s 9b
91 add #3,r5
92 bra 8b
93 add #-6,r2
94
95case1:
96 !
97 ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR.
98 !
99 ! First, align to long word boundary
100 mov r0,r3
101 and r2,r3
102 tst r3,r3
103 bt/s 2f
104 add #-1,r5
1051: dt r3
106 mov.b @(r0,r5),r1
107 bf/s 1b
108 mov.b r1,@-r0
109 !
1102: ! Second, read a long word and write a long word at once
111 mov.l @(r0,r5),r1
112 add #-4,r5
113 mov r4,r2
114 add #7,r2
115 !
116#ifdef __LITTLE_ENDIAN__
1173: mov r1,r3 ! RQPO
118 shll16 r3
119 shll8 r3 ! Oxxx
120 mov.l @(r0,r5),r1 ! NMLK
121 mov r1,r6
122 shlr8 r6 ! xNML
123 or r6,r3 ! ONML
124 cmp/hi r2,r0
125 bt/s 3b
126 mov.l r3,@-r0
127#else
1283: mov r1,r3 ! OPQR
129 shlr16 r3
130 shlr8 r3 ! xxxO
131 mov.l @(r0,r5),r1 ! KLMN
132 mov r1,r6
133 shll8 r6 ! LMNx
134 or r6,r3 ! LMNO
135 cmp/hi r2,r0
136 bt/s 3b
137 mov.l r3,@-r0
138#endif
139 !
140 ! Third, copy a byte at once, if necessary
141 cmp/eq r4,r0
142 bt/s 9b
143 add #4,r5
144 bra 8b
145 add #-6,r2
146
147case2:
148 !
149 ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR..
150 !
151 ! First, align to word boundary
152 tst #1,r0
153 bt/s 2f
154 add #-1,r5
155 mov.b @(r0,r5),r1
156 mov.b r1,@-r0
157 !
1582: ! Second, read a word and write a word at once
159 add #-1,r5
160 mov r4,r2
161 add #3,r2
162 !
1633: mov.w @(r0,r5),r1
164 cmp/hi r2,r0
165 bt/s 3b
166 mov.w r1,@-r0
167 !
168 ! Third, copy a byte at once, if necessary
169 cmp/eq r4,r0
170 bt/s 9b
171 add #1,r5
172 mov.b @(r0,r5),r1
173 rts
174 mov.b r1,@-r0
175
176case3:
177 !
178 ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R...
179 !
180 ! First, align to long word boundary
181 mov r0,r3
182 and r2,r3
183 tst r3,r3
184 bt/s 2f
185 add #-1,r5
1861: dt r3
187 mov.b @(r0,r5),r1
188 bf/s 1b
189 mov.b r1,@-r0
190 !
1912: ! Second, read a long word and write a long word at once
192 add #-2,r5
193 mov.l @(r0,r5),r1
194 add #-4,r5
195 mov r4,r2
196 add #7,r2
197 !
198#ifdef __LITTLE_ENDIAN__
1993: mov r1,r3 ! RQPO
200 shll8 r3 ! QPOx
201 mov.l @(r0,r5),r1 ! NMLK
202 mov r1,r6
203 shlr16 r6
204 shlr8 r6 ! xxxN
205 or r6,r3 ! QPON
206 cmp/hi r2,r0
207 bt/s 3b
208 mov.l r3,@-r0
209#else
2103: mov r1,r3 ! OPQR
211 shlr8 r3 ! xOPQ
212 mov.l @(r0,r5),r1 ! KLMN
213 mov r1,r6
214 shll16 r6
215 shll8 r6 ! Nxxx
216 or r6,r3 ! NOPQ
217 cmp/hi r2,r0
218 bt/s 3b
219 mov.l r3,@-r0
220#endif
221 !
222 ! Third, copy a byte at once, if necessary
223 cmp/eq r4,r0
224 bt/s 9b
225 add #6,r5
226 bra 8b
227 add #-6,r2
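
The structure shared by memcpy.S above and memcpy-sh4.S is: return immediately for n == 0, copy byte-by-byte for very short lengths, and otherwise copy downwards from the end, keeping only a moving destination cursor (r0) and the constant src-dst distance (r5), with the four-way jmptable dispatch on (src - dst) & 3 choosing the word-combining case. A behavioural C sketch of that backward copy (illustrative and byte-granular, so the alignment cases collapse into one loop):

#include <stddef.h>

static void *memcpy_backwards_model(void *dst, const void *src, size_t n)
{
	unsigned char *d = (unsigned char *)dst + n;		/* r0: end of dest */
	ptrdiff_t dist = (const unsigned char *)src -
			 (unsigned char *)dst;			/* r5 */

	while (d > (unsigned char *)dst) {
		--d;
		*d = *(d + dist);				/* read src, write dst */
	}
	return dst;						/* the asm returns r4 */
}
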
diff --git a/arch/sh/lib/memmove.S b/arch/sh/lib/memmove.S
new file mode 100644
index 000000000000..5a2211f09202
--- /dev/null
+++ b/arch/sh/lib/memmove.S
@@ -0,0 +1,254 @@
1/* $Id: memmove.S,v 1.2 2001/07/27 11:51:09 gniibe Exp $
2 *
3 * "memmove" implementation of SuperH
4 *
5 * Copyright (C) 1999 Niibe Yutaka
6 *
7 */
8
9/*
10 * void *memmove(void *dst, const void *src, size_t n);
11 * The memory areas may overlap.
12 */
13
14#include <linux/linkage.h>
15ENTRY(memmove)
16 ! if dest > src, call memcpy (it copies in decreasing order)
17 cmp/hi r5,r4
18 bf 1f
19 mov.l 2f,r0
20 jmp @r0
21 nop
22 .balign 4
232: .long memcpy
241:
25 sub r5,r4 ! From here, r4 has the distance to r0
26 tst r6,r6
27 bt/s 9f ! if n=0, do nothing
28 mov r5,r0
29 add r6,r5
30 mov #12,r1
31 cmp/gt r6,r1
32 bt/s 8f ! if it's too small, copy a byte at once
33 add #-1,r4
34 add #1,r4
35 !
36 ! [ ... ] DST [ ... ] SRC
37 ! [ ... ] [ ... ]
38 ! : :
39 ! r0+r4--> [ ... ] r0 --> [ ... ]
40 ! : :
41 ! [ ... ] [ ... ]
42 ! r5 -->
43 !
44 mov r4,r1
45 mov #3,r2
46 and r2,r1
47 shll2 r1
48 mov r0,r3 ! Save the value on R0 to R3
49 mova jmptable,r0
50 add r1,r0
51 mov.l @r0,r1
52 jmp @r1
53 mov r3,r0 ! and back to R0
54 .balign 4
55jmptable:
56 .long case0
57 .long case1
58 .long case2
59 .long case3
60
61 ! copy a byte at once
628: mov.b @r0+,r1
63 cmp/hs r5,r0
64 bf/s 8b ! while (r0<r5)
65 mov.b r1,@(r0,r4)
66 add #1,r4
679:
68 add r4,r0
69 rts
70 sub r6,r0
71
72case_none:
73 bra 8b
74 add #-1,r4
75
76case0:
77 !
78 ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR
79 !
80 ! First, align to long word boundary
81 mov r0,r3
82 and r2,r3
83 tst r3,r3
84 bt/s 2f
85 add #-1,r4
86 mov #4,r2
87 sub r3,r2
881: dt r2
89 mov.b @r0+,r1
90 bf/s 1b
91 mov.b r1,@(r0,r4)
92 !
932: ! Second, copy a long word at once
94 add #-3,r4
95 add #-3,r5
963: mov.l @r0+,r1
97 cmp/hs r5,r0
98 bf/s 3b
99 mov.l r1,@(r0,r4)
100 add #3,r5
101 !
102 ! Third, copy a byte at once, if necessary
103 cmp/eq r5,r0
104 bt/s 9b
105 add #4,r4
106 bra 8b
107 add #-1,r4
108
109case3:
110 !
111 ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR.
112 !
113 ! First, align to long word boundary
114 mov r0,r3
115 and r2,r3
116 tst r3,r3
117 bt/s 2f
118 add #-1,r4
119 mov #4,r2
120 sub r3,r2
1211: dt r2
122 mov.b @r0+,r1
123 bf/s 1b
124 mov.b r1,@(r0,r4)
125 !
1262: ! Second, read a long word and write a long word at once
127 add #-2,r4
128 mov.l @(r0,r4),r1
129 add #-7,r5
130 add #-4,r4
131 !
132#ifdef __LITTLE_ENDIAN__
133 shll8 r1
1343: mov r1,r3 ! JIHG
135 shlr8 r3 ! xJIH
136 mov.l @r0+,r1 ! NMLK
137 mov r1,r2
138 shll16 r2
139 shll8 r2 ! Kxxx
140 or r2,r3 ! KJIH
141 cmp/hs r5,r0
142 bf/s 3b
143 mov.l r3,@(r0,r4)
144#else
145 shlr8 r1
1463: mov r1,r3 ! GHIJ
147 shll8 r3 ! HIJx
148 mov.l @r0+,r1 ! KLMN
149 mov r1,r2
150 shlr16 r2
151 shlr8 r2 ! xxxK
152 or r2,r3 ! HIJK
153 cmp/hs r5,r0
154 bf/s 3b
155 mov.l r3,@(r0,r4)
156#endif
157 add #7,r5
158 !
159 ! Third, copy a byte at once, if necessary
160 cmp/eq r5,r0
161 bt/s 9b
162 add #7,r4
163 add #-3,r0
164 bra 8b
165 add #-1,r4
166
167case2:
168 !
169 ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR..
170 !
171 ! First, align to word boundary
172 tst #1,r0
173 bt/s 2f
174 add #-1,r4
175 mov.b @r0+,r1
176 mov.b r1,@(r0,r4)
177 !
1782: ! Second, read a word and write a word at once
179 add #-1,r4
180 add #-1,r5
181 !
1823: mov.w @r0+,r1
183 cmp/hs r5,r0
184 bf/s 3b
185 mov.w r1,@(r0,r4)
186 add #1,r5
187 !
188 ! Third, copy a byte at once, if necessary
189 cmp/eq r5,r0
190 bt/s 9b
191 add #2,r4
192 mov.b @r0,r1
193 mov.b r1,@(r0,r4)
194 bra 9b
195 add #1,r0
196
197case1:
198 !
199 ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R...
200 !
201 ! First, align to long word boundary
202 mov r0,r3
203 and r2,r3
204 tst r3,r3
205 bt/s 2f
206 add #-1,r4
207 mov #4,r2
208 sub r3,r2
2091: dt r2
210 mov.b @r0+,r1
211 bf/s 1b
212 mov.b r1,@(r0,r4)
213 !
2142: ! Second, read a long word and write a long word at once
215 mov.l @(r0,r4),r1
216 add #-7,r5
217 add #-4,r4
218 !
219#ifdef __LITTLE_ENDIAN__
220 shll16 r1
221 shll8 r1
2223: mov r1,r3 ! JIHG
223 shlr16 r3
224 shlr8 r3 ! xxxJ
225 mov.l @r0+,r1 ! NMLK
226 mov r1,r2
227 shll8 r2 ! MLKx
228 or r2,r3 ! MLKJ
229 cmp/hs r5,r0
230 bf/s 3b
231 mov.l r3,@(r0,r4)
232#else
233 shlr16 r1
234 shlr8 r1
2353: mov r1,r3 ! GHIJ
236 shll16 r3
237 shll8 r3 ! Jxxx
238 mov.l @r0+,r1 ! KLMN
239 mov r1,r2
240 shlr8 r2 ! xKLM
241 or r2,r3 ! JKLM
242 cmp/hs r5,r0
243 bf/s 3b ! while(r0<r5)
244 mov.l r3,@(r0,r4)
245#endif
246 add #7,r5
247 !
248 ! Third, copy a byte at once, if necessary
249 cmp/eq r5,r0
250 bt/s 9b
251 add #5,r4
252 add #-3,r0
253 bra 8b
254 add #-1,r4
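
A hedged sketch of the overlap handling above: when dst > src the routine tail-calls memcpy (which copies downwards, so the tail of the source is read before it can be overwritten); otherwise it copies forwards itself, low addresses first. Illustrative C, not the kernel routine:

#include <stddef.h>

static void *memmove_model(void *dst, const void *src, size_t n)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (d > s) {			/* cmp/hi r5,r4 path: copy backwards */
		while (n--)
			d[n] = s[n];
	} else {			/* copy forwards */
		for (size_t i = 0; i < n; i++)
			d[i] = s[i];
	}
	return dst;
}
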
diff --git a/arch/sh/lib/memset.S b/arch/sh/lib/memset.S
new file mode 100644
index 000000000000..95670090680e
--- /dev/null
+++ b/arch/sh/lib/memset.S
@@ -0,0 +1,57 @@
1/* $Id: memset.S,v 1.1 2000/04/14 16:49:01 mjd Exp $
2 *
3 * "memset" implementation of SuperH
4 *
5 * Copyright (C) 1999 Niibe Yutaka
6 *
7 */
8
9/*
10 * void *memset(void *s, int c, size_t n);
11 */
12
13#include <linux/linkage.h>
14
15ENTRY(memset)
16 tst r6,r6
17 bt/s 5f ! if n=0, do nothing
18 add r6,r4
19 mov #12,r0
20 cmp/gt r6,r0
21 bt/s 4f ! if it's too small, set a byte at once
22 mov r4,r0
23 and #3,r0
24 cmp/eq #0,r0
25 bt/s 2f ! It's aligned
26 sub r0,r6
271:
28 dt r0
29 bf/s 1b
30 mov.b r5,@-r4
312: ! make VVVV
32 swap.b r5,r0 ! V0
33 or r0,r5 ! VV
34 swap.w r5,r0 ! VV00
35 or r0,r5 ! VVVV
36 !
37 mov r6,r0
38 shlr2 r0
39 shlr r0 ! r0 = r6 >> 3
403:
41 dt r0
42 mov.l r5,@-r4 ! set 8-byte at once
43 bf/s 3b
44 mov.l r5,@-r4
45 !
46 mov #7,r0
47 and r0,r6
48 tst r6,r6
49 bt 5f
50 ! fill bytes
514:
52 dt r6
53 bf/s 4b
54 mov.b r5,@-r4
555:
56 rts
57 mov r4,r0
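
The fill strategy above is: broadcast the byte value into a 32-bit "VVVV" pattern (the swap.b/swap.w steps), write the bulk as a pair of long words per loop iteration, and finish the remaining bytes one at a time; everything is stored downwards from the end. A hedged C model (illustrative; a strict-aliasing-clean version would store the words with memcpy):

#include <stddef.h>
#include <stdint.h>

static void *memset_model(void *s, int c, size_t n)
{
	unsigned char *p = (unsigned char *)s + n;	/* end pointer, filled downwards */
	uint32_t v = (uint8_t)c;

	v |= v << 8;					/* VV   (swap.b + or) */
	v |= v << 16;					/* VVVV (swap.w + or) */

	if (n >= 12) {					/* the asm's small-size cutoff */
		while ((uintptr_t)p & 3) {		/* align the end pointer */
			*--p = (uint8_t)c;
			n--;
		}
		for (size_t pairs = n >> 3; pairs; pairs--) {
			p -= 4;
			*(uint32_t *)p = v;		/* aligned stores, 8 bytes/iteration */
			p -= 4;
			*(uint32_t *)p = v;
		}
		n &= 7;
	}
	while (n--)					/* remaining bytes (all of them for short fills) */
		*--p = (uint8_t)c;
	return s;
}
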
diff --git a/arch/sh/lib/strcasecmp.c b/arch/sh/lib/strcasecmp.c
new file mode 100644
index 000000000000..4e57a216feaf
--- /dev/null
+++ b/arch/sh/lib/strcasecmp.c
@@ -0,0 +1,26 @@
/*
 * linux/arch/alpha/lib/strcasecmp.c
 */

#include <linux/string.h>


/* We handle nothing here except the C locale.  Since this is used in
   only one place, on strings known to contain only 7 bit ASCII, this
   is ok. */

int strcasecmp(const char *a, const char *b)
{
	int ca, cb;

	do {
		ca = *a++ & 0xff;
		cb = *b++ & 0xff;
		if (ca >= 'A' && ca <= 'Z')
			ca += 'a' - 'A';
		if (cb >= 'A' && cb <= 'Z')
			cb += 'a' - 'A';
	} while (ca == cb && ca != '\0');

	return ca - cb;
}
diff --git a/arch/sh/lib/strlen.S b/arch/sh/lib/strlen.S
new file mode 100644
index 000000000000..f8ab296047b3
--- /dev/null
+++ b/arch/sh/lib/strlen.S
@@ -0,0 +1,70 @@
1/* $Id: strlen.S,v 1.2 2001/06/29 14:07:15 gniibe Exp $
2 *
3 * "strlen" implementation of SuperH
4 *
5 * Copyright (C) 1999 Kaz Kojima
6 *
7 */
8
9/* size_t strlen (const char *s) */
10
11#include <linux/linkage.h>
12ENTRY(strlen)
13 mov r4,r0
14 and #3,r0
15 tst r0,r0
16 bt/s 1f
17 mov #0,r2
18
19 add #-1,r0
20 shll2 r0
21 shll r0
22 braf r0
23 nop
24
25 mov.b @r4+,r1
26 tst r1,r1
27 bt 8f
28 add #1,r2
29
30 mov.b @r4+,r1
31 tst r1,r1
32 bt 8f
33 add #1,r2
34
35 mov.b @r4+,r1
36 tst r1,r1
37 bt 8f
38 add #1,r2
39
401:
41 mov #0,r3
422:
43 mov.l @r4+,r1
44 cmp/str r3,r1
45 bf/s 2b
46 add #4,r2
47
48 add #-4,r2
49#ifndef __LITTLE_ENDIAN__
50 swap.b r1,r1
51 swap.w r1,r1
52 swap.b r1,r1
53#endif
54 extu.b r1,r0
55 tst r0,r0
56 bt/s 8f
57 shlr8 r1
58 add #1,r2
59 extu.b r1,r0
60 tst r0,r0
61 bt/s 8f
62 shlr8 r1
63 add #1,r2
64 extu.b r1,r0
65 tst r0,r0
66 bt 8f
67 add #1,r2
688:
69 rts
70 mov r2,r0
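
The word-at-a-time scan above works because cmp/str sets T when any byte of its two operands matches, so comparing each long word against 0 detects a terminator four bytes per iteration, and the shlr8/extu.b epilogue then pins down the exact byte. A portable C stand-in (illustrative only) uses the classic has-zero-byte bit trick in place of cmp/str:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static size_t strlen_model(const char *s)
{
	const char *p = s;
	uint32_t w;

	while ((uintptr_t)p & 3) {		/* leading bytes up to alignment */
		if (*p == '\0')
			return (size_t)(p - s);
		p++;
	}
	for (;;) {
		memcpy(&w, p, sizeof(w));	/* aligned 32-bit load, as mov.l does */
		/* non-zero iff some byte of w is zero -- the role cmp/str #0 plays */
		if ((w - 0x01010101u) & ~w & 0x80808080u)
			break;
		p += 4;
	}
	while (*p)				/* locate the exact terminator byte */
		p++;
	return (size_t)(p - s);
}
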
diff --git a/arch/sh/lib/udivdi3.c b/arch/sh/lib/udivdi3.c
new file mode 100644
index 000000000000..68f038bf3c50
--- /dev/null
+++ b/arch/sh/lib/udivdi3.c
@@ -0,0 +1,16 @@
/*
 * Simple __udivdi3 function which doesn't use FPU.
 */

#include <linux/types.h>

extern u64 __xdiv64_32(u64 n, u32 d);
extern void panic(const char * fmt, ...);

u64 __udivdi3(u64 n, u64 d)
{
	if (d & ~0xffffffffULL)		/* divisor must fit in 32 bits */
		panic("Need true 64-bit/64-bit division");
	return __xdiv64_32(n, (u32)d);
}
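
__udivdi3 is the libgcc helper name the compiler emits for unsigned 64-bit division on 32-bit SuperH, so ordinary C like the hypothetical caller below ends up in the routine above; that is fine as long as the divisor fits in 32 bits, otherwise the stub panics. Illustrative only, using stdint names in place of u64/u32:

#include <stdint.h>

static uint64_t bytes_to_blocks(uint64_t bytes, uint32_t block_size)
{
	return bytes / block_size;	/* lowered to a __udivdi3() call */
}
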