aboutsummaryrefslogtreecommitdiffstats
path: root/arch/sh/lib64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/sh/lib64')
-rw-r--r--arch/sh/lib64/Makefile8
-rw-r--r--arch/sh/lib64/c-checksum.c4
-rw-r--r--arch/sh/lib64/memcpy.S201
-rw-r--r--arch/sh/lib64/memcpy.c81
-rw-r--r--arch/sh/lib64/memset.S91
-rw-r--r--arch/sh/lib64/sdivsi3.S131
-rw-r--r--arch/sh/lib64/strcpy.S97
-rw-r--r--arch/sh/lib64/strlen.S33
-rw-r--r--arch/sh/lib64/udivdi3.S120
-rw-r--r--arch/sh/lib64/udivsi3.S59
10 files changed, 739 insertions, 86 deletions
diff --git a/arch/sh/lib64/Makefile b/arch/sh/lib64/Makefile
index 9950966923a0..4bacb9e83478 100644
--- a/arch/sh/lib64/Makefile
+++ b/arch/sh/lib64/Makefile
@@ -2,7 +2,7 @@
2# Makefile for the SH-5 specific library files.. 2# Makefile for the SH-5 specific library files..
3# 3#
4# Copyright (C) 2000, 2001 Paolo Alberelli 4# Copyright (C) 2000, 2001 Paolo Alberelli
5# Copyright (C) 2003 Paul Mundt 5# Copyright (C) 2003 - 2008 Paul Mundt
6# 6#
7# This file is subject to the terms and conditions of the GNU General Public 7# This file is subject to the terms and conditions of the GNU General Public
8# License. See the file "COPYING" in the main directory of this archive 8# License. See the file "COPYING" in the main directory of this archive
@@ -10,6 +10,8 @@
10# 10#
11 11
12# Panic should really be compiled as PIC 12# Panic should really be compiled as PIC
13lib-y := udelay.o c-checksum.o dbg.o panic.o memcpy.o copy_user_memcpy.o \ 13lib-y := udelay.o c-checksum.o dbg.o panic.o memcpy.o memset.o \
14 copy_page.o clear_page.o 14 copy_user_memcpy.o copy_page.o clear_page.o strcpy.o strlen.o
15 15
16# Extracted from libgcc
17lib-y += udivsi3.o udivdi3.o sdivsi3.o
diff --git a/arch/sh/lib64/c-checksum.c b/arch/sh/lib64/c-checksum.c
index 5c284e0cff9c..73c0877e3a29 100644
--- a/arch/sh/lib64/c-checksum.c
+++ b/arch/sh/lib64/c-checksum.c
@@ -35,7 +35,7 @@ static inline unsigned short foldto16(unsigned long x)
35 35
36static inline unsigned short myfoldto16(unsigned long long x) 36static inline unsigned short myfoldto16(unsigned long long x)
37{ 37{
38 /* Fold down to 32-bits so we don't loose in the typedef-less 38 /* Fold down to 32-bits so we don't lose in the typedef-less
39 network stack. */ 39 network stack. */
40 /* 64 to 33 */ 40 /* 64 to 33 */
41 x = (x & 0xffffffff) + (x >> 32); 41 x = (x & 0xffffffff) + (x >> 32);
@@ -199,7 +199,7 @@ __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
199 result = (__force u64) saddr + (__force u64) daddr + 199 result = (__force u64) saddr + (__force u64) daddr +
200 (__force u64) sum + ((len + proto) << 8); 200 (__force u64) sum + ((len + proto) << 8);
201 201
202 /* Fold down to 32-bits so we don't loose in the typedef-less 202 /* Fold down to 32-bits so we don't lose in the typedef-less
203 network stack. */ 203 network stack. */
204 /* 64 to 33 */ 204 /* 64 to 33 */
205 result = (result & 0xffffffff) + (result >> 32); 205 result = (result & 0xffffffff) + (result >> 32);
diff --git a/arch/sh/lib64/memcpy.S b/arch/sh/lib64/memcpy.S
new file mode 100644
index 000000000000..dd300c372ce1
--- /dev/null
+++ b/arch/sh/lib64/memcpy.S
@@ -0,0 +1,201 @@
1/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
2/* Modified by SuperH, Inc. September 2003 */
3!
4! Fast SH memcpy
5!
6! by Toshiyasu Morita (tm@netcom.com)
7! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
8! SH5 code Copyright 2002 SuperH Ltd.
9!
10! Entry: ARG0: destination pointer
11! ARG1: source pointer
12! ARG2: byte count
13!
14! Exit: RESULT: destination pointer
15! any other registers in the range r0-r7: trashed
16!
17! Notes: Usually one wants to do small reads and write a longword, but
18! unfortunately it is difficult in some cases to concatenate bytes
19! into a longword on the SH, so this does a longword read and small
20! writes.
21!
22! This implementation makes two assumptions about how it is called:
23!
24! 1.: If the byte count is nonzero, the address of the last byte to be
25! copied is unsigned greater than the address of the first byte to
26! be copied. This could be easily swapped for a signed comparison,
27! but the algorithm used needs some comparison.
28!
29! 2.: When there are two or three bytes in the last word of an 11-or-more
30! bytes memory chunk to be copied, the rest of the word can be read
31! without side effects.
32! This could be easily changed by increasing the minimum size of
33! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
34! however, this would cost a few extra cycles on average.
35! For SHmedia, the assumption is that any quadword can be read in its
36! entirety if at least one byte is included in the copy.
37!
38
39 .section .text..SHmedia32,"ax"
40 .globl memcpy
41 .type memcpy, @function
42 .align 5
43
/*
 * memcpy (SHmedia): r2 = destination, r3 = source, r4 = byte count.
 * r2 is never written below, so the destination pointer is still in r2
 * when the routine returns through the target loaded from r18.
 *
 * Counts below 25 are dispatched to one of a series of handlers laid out
 * in 32-byte slots starting at L1; the slot is selected from nsb(count)
 * (63 for a zero count, hence the 63*32 bias).  Do not add or remove
 * instructions between L1 and Large without keeping that layout intact.
 * Counts of 25 and above go to Large.
 */
44memcpy:
45
46#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
47#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
48#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
49#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
50
51 ld.b r3,0,r63 // result discarded (r63); presumably just touches the first source byte early - TODO confirm
52 pta/l Large,tr0
53 movi 25,r0
54 bgeu/u r4,r0,tr0 // count >= 25: take the Large path
55 nsb r4,r0 // size class of count (63 for count == 0)
56 shlli r0,5,r0 // scale by 32, the size of one handler slot
57 movi (L1-L0+63*32 + 1) & 0xffff,r1
58 sub r1, r0, r0 // r0 = offset from L0 to the handler for this size class
59L0: ptrel r0,tr0
60 add r2,r4,r5 // r5 = dest + count (one past the last destination byte)
61 ptabs r18,tr1 // tr1 = return target taken from r18
62 add r3,r4,r6 // r6 = src + count (one past the last source byte)
63 blink tr0,r63 // jump to the selected handler
64
65/* Rearranged to make cut2 safe */
66 .balign 8
67L4_7: /* 4..7 byte memcpy cntd. */
68 stlo.l r2, 0, r0
69 or r6, r7, r6 // combine the two halves of the trailing unaligned longword
70 sthi.l r5, -1, r6
71 stlo.l r5, -4, r6
72 blink tr1,r63
73
74 .balign 8
75L1: /* 0 byte memcpy */
76 nop
77 blink tr1,r63
78 nop // padding: keeps the following handlers on their 32-byte slots
79 nop
80 nop
81 nop
82
83L2_3: /* 2 or 3 byte memcpy cntd. */
84 st.b r5,-1,r6 // store the last byte (already loaded into r6)
85 blink tr1,r63
86
87 /* 1 byte memcpy */
88 ld.b r3,0,r0
89 st.b r2,0,r0
90 blink tr1,r63
91
92L8_15: /* 8..15 byte memcpy cntd. */
93 stlo.q r2, 0, r0
94 or r6, r7, r6 // combine the two halves of the trailing unaligned quadword
95 sthi.q r5, -1, r6
96 stlo.q r5, -8, r6
97 blink tr1,r63
98
99 /* 2 or 3 byte memcpy */
100 ld.b r3,0,r0
101 ld.b r2,0,r63 // destination load, result discarded
102 ld.b r3,1,r1
103 st.b r2,0,r0
104 pta/l L2_3,tr0
105 ld.b r6,-1,r6 // last source byte; it is byte 1 again when count == 2
106 st.b r2,1,r1
107 blink tr0, r63
108
109 /* 4 .. 7 byte memcpy */
110 LDUAL (r3, 0, r0, r1) // first 4 source bytes (unaligned load pair)
111 pta L4_7, tr0
112 ldlo.l r6, -4, r7 // last 4 source bytes, low part
113 or r0, r1, r0
114 sthi.l r2, 3, r0
115 ldhi.l r6, -1, r6 // last 4 source bytes, high part
116 blink tr0, r63
117
118 /* 8 .. 15 byte memcpy */
119 LDUAQ (r3, 0, r0, r1) // first 8 source bytes (unaligned load pair)
120 pta L8_15, tr0
121 ldlo.q r6, -8, r7 // last 8 source bytes, low part
122 or r0, r1, r0
123 sthi.q r2, 7, r0
124 ldhi.q r6, -1, r6 // last 8 source bytes, high part
125 blink tr0, r63
126
127 /* 16 .. 24 byte memcpy */
128 LDUAQ (r3, 0, r0, r1) // source bytes 0..7
129 LDUAQ (r3, 8, r8, r9) // source bytes 8..15
130 or r0, r1, r0
131 sthi.q r2, 7, r0
132 or r8, r9, r8
133 sthi.q r2, 15, r8
134 ldlo.q r6, -8, r7 // last 8 source bytes (may overlap bytes 8..15)
135 ldhi.q r6, -1, r6
136 stlo.q r2, 8, r8
137 stlo.q r2, 0, r0
138 or r6, r7, r6
139 sthi.q r5, -1, r6
140 stlo.q r5, -8, r6
141 blink tr1,r63
142
143Large:
144 ld.b r2, 0, r63 // destination load, result discarded
145 pta/l Loop_ua, tr1
146 ori r3, -8, r7 // r7 = (src & 7) - 8
147 sub r2, r7, r22 // r22 = dest + 8 - (src & 7): dest cursor whose matching source address is 8-byte aligned
148 sub r3, r2, r6 // r6 = src - dest, so r22 + r6 is the source address matching r22
149 add r2, r4, r5
150 ldlo.q r3, 0, r0 // source bytes up to the first quadword boundary
151 addi r5, -16, r5 // r5 = dest + count - 16
152 movi 64+8, r27 // could subtract r7 from that.
153 stlo.q r2, 0, r0 // store the head at dest as an unaligned pair
154 sthi.q r2, 7, r0
155 ldx.q r22, r6, r0 // first aligned source quadword
156 bgtu/l r27, r4, tr1 // count < 72: skip the 32-byte loop
157
158 addi r5, -48, r27 // r27 = dest + count - 64: bound for Loop_line
159 pta/l Loop_line, tr0
160 addi r6, 64, r36
161 addi r6, -24, r19
162 addi r6, -16, r20
163 addi r6, -8, r21
164
165Loop_line: // copies 32 bytes per iteration
166 ldx.q r22, r36, r63 // load ahead of the copy, result discarded; presumably a prefetch - TODO confirm
167 alloco r22, 32
168 addi r22, 32, r22
169 ldx.q r22, r19, r23
170 sthi.q r22, -25, r0
171 ldx.q r22, r20, r24
172 ldx.q r22, r21, r25
173 stlo.q r22, -32, r0
174 ldx.q r22, r6, r0 // next source quadword, carried into the next iteration
175 sthi.q r22, -17, r23
176 sthi.q r22, -9, r24
177 sthi.q r22, -1, r25
178 stlo.q r22, -24, r23
179 stlo.q r22, -16, r24
180 stlo.q r22, -8, r25
181 bgeu r27, r22, tr0
182
183Loop_ua: // copies 8 bytes per iteration
184 addi r22, 8, r22
185 sthi.q r22, -1, r0
186 stlo.q r22, -8, r0
187 ldx.q r22, r6, r0
188 bgtu/l r5, r22, tr1
189
190 add r3, r4, r7 // r7 = src + count
191 ldlo.q r7, -8, r1 // final 8 source bytes, read as an unaligned pair
192 sthi.q r22, 7, r0
193 ldhi.q r7, -1, r7
194 ptabs r18,tr1 // tr1 was used for Loop_ua; reload it with the return target
195 stlo.q r22, 0, r0
196 or r1, r7, r1
197 sthi.q r5, 15, r1 // r5 + 15 = dest + count - 1: store the final 8 bytes
198 stlo.q r5, 8, r1
199 blink tr1, r63
200
201 .size memcpy,.-memcpy
diff --git a/arch/sh/lib64/memcpy.c b/arch/sh/lib64/memcpy.c
deleted file mode 100644
index fba436a92bfa..000000000000
--- a/arch/sh/lib64/memcpy.c
+++ /dev/null
@@ -1,81 +0,0 @@
1/*
2 * Copyright (C) 2002 Mark Debbage (Mark.Debbage@superh.com)
3 *
4 * May be copied or modified under the terms of the GNU General Public
5 * License. See linux/COPYING for more information.
6 *
7 */
8
9#include <linux/types.h>
10#include <asm/string.h>
11
12// This is a simplistic optimization of memcpy to increase the
13// granularity of access beyond one byte using aligned
14// loads and stores. This is not an optimal implementation
15// for SH-5 (especially with regard to prefetching and the cache),
16// and a better version should be provided later ...
17
/*
 * memcpy - copy count bytes from src to dest (the regions must not overlap).
 *
 * For copies of 32 bytes or more the destination is first brought up to an
 * 8-byte boundary, then the data is moved in the widest units for which both
 * pointers are suitably aligned (32/8-byte, then 4-byte, then 2-byte).
 * Whatever is left is copied byte by byte.
 *
 * Returns dest, as the C library contract for memcpy() requires.  The
 * previous version returned the advanced cursor (dest + count), which broke
 * every caller that uses the return value.
 */
void *memcpy(void *dest, const void *src, size_t count)
{
	char *d = (char *) dest;
	const char *s = (const char *) src;

	if (count >= 32) {
		/* Bytes needed to reach the next 8-byte boundary of d (0..7). */
		unsigned int head = (8 - (unsigned int) (((unsigned long) d) & 0x7)) & 0x7;

		/* count >= 32 here, so the head copy can never exhaust it. */
		count -= head;
		while (head--)
			*d++ = *s++;

		if (((((unsigned long) d) & 0x7) == 0) &&
		    ((((unsigned long) s) & 0x7) == 0)) {
			/* Load all four quadwords before storing any of them. */
			while (count >= 32) {
				unsigned long long t1, t2, t3, t4;

				t1 = *(const unsigned long long *) (s);
				t2 = *(const unsigned long long *) (s + 8);
				t3 = *(const unsigned long long *) (s + 16);
				t4 = *(const unsigned long long *) (s + 24);
				*(unsigned long long *) (d) = t1;
				*(unsigned long long *) (d + 8) = t2;
				*(unsigned long long *) (d + 16) = t3;
				*(unsigned long long *) (d + 24) = t4;
				d += 32;
				s += 32;
				count -= 32;
			}
			while (count >= 8) {
				*(unsigned long long *) d =
					*(const unsigned long long *) s;
				d += 8;
				s += 8;
				count -= 8;
			}
		}

		/*
		 * The cursor advances by 4 per step, so the access must be
		 * exactly 4 bytes wide.  "unsigned long" is only 4 bytes on
		 * 32-bit ABIs; "unsigned int" is 4 bytes on both.
		 */
		if (((((unsigned long) d) & 0x3) == 0) &&
		    ((((unsigned long) s) & 0x3) == 0)) {
			while (count >= 4) {
				*(unsigned int *) d = *(const unsigned int *) s;
				d += 4;
				s += 4;
				count -= 4;
			}
		}

		if (((((unsigned long) d) & 0x1) == 0) &&
		    ((((unsigned long) s) & 0x1) == 0)) {
			while (count >= 2) {
				*(unsigned short *) d = *(const unsigned short *) s;
				d += 2;
				s += 2;
				count -= 2;
			}
		}
	}

	/* Short copies and any unaligned remainder. */
	while (count--) {
		*d++ = *s++;
	}

	return dest;
}
diff --git a/arch/sh/lib64/memset.S b/arch/sh/lib64/memset.S
new file mode 100644
index 000000000000..2d37b0488552
--- /dev/null
+++ b/arch/sh/lib64/memset.S
@@ -0,0 +1,91 @@
1/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
2/* Modified by SuperH, Inc. September 2003 */
3!
4! Fast SH memset
5!
6! by Toshiyasu Morita (tm@netcom.com)
7!
8! SH5 code by J"orn Rennecke (joern.rennecke@superh.com)
9! Copyright 2002 SuperH Ltd.
10!
11
12#if __BYTE_ORDER == __LITTLE_ENDIAN
13#define SHHI shlld
14#define SHLO shlrd
15#else
16#define SHHI shlrd
17#define SHLO shlld
18#endif
19
20 .section .text..SHmedia32,"ax"
21 .globl memset
22 .type memset, @function
23
24 .align 5
25
/*
 * memset (SHmedia): r2 = destination, r3 = fill value, r4 = byte count.
 * r2 is never written below, so the destination pointer is still in r2
 * when the routine returns through the target loaded from r18.
 *
 * A fill that stays inside one 8-byte quadword is done as a masked
 * read-modify-write.  Anything longer goes to multiquad, which stores
 * whole quadwords plus an unaligned head (stlo.q) and tail (sthi.q).
 */
26memset:
27 pta/l multiquad, tr0
28 andi r2, 7, r22 // r22 = offset of dest inside its quadword
29 ptabs r18, tr2 // tr2 = return target taken from r18
30 mshflo.b r3,r3,r3
31 add r4, r22, r23 // r23 = count + offset
32 mperm.w r3, r63, r3 // Fill pattern now in every byte of r3
33
34 movi 8, r9
35 bgtu/u r23, r9, tr0 // multiquad
36
37 beqi/u r4, 0, tr2 // Return with size 0 - ensures no mem accesses
38 ldlo.q r2, 0, r7 // current memory contents from dest to the end of its quadword
39 shlli r4, 2, r4 // r4 = 4 * count
40 movi -1, r8
41 SHHI r8, r4, r8 // two shifts by 4*count = 8*count bits in total;
42 SHHI r8, r4, r8 // presumably split so that count == 8 never needs a single 64-bit shift - TODO confirm
43 mcmv r7, r8, r3 // under mask r8, take the old memory bits instead of the fill pattern
44 stlo.q r2, 0, r3
45 blink tr2, r63
46
47multiquad:
48 pta/l lastquad, tr0
49 stlo.q r2, 0, r3 // head: from dest to the end of its quadword
50 shlri r23, 3, r24 // r24 = (count + offset) / 8
51 add r2, r4, r5 // r5 = dest + count (one past the last byte)
52 beqi/u r24, 1, tr0 // lastquad
53 pta/l loop, tr1
54 sub r2, r22, r25 // r25 = dest rounded down to a quadword boundary
55 andi r5, -8, r20 // calculate end address and
56 addi r20, -7*8, r8 // loop end address; This might overflow, so we need
57 // to use a different test before we start the loop
58 bge/u r24, r9, tr1 // loop
59 st.q r25, 8, r3 // fewer than 8 quadwords: fill inwards from both ends
60 st.q r20, -8, r3
61 shlri r24, 1, r24
62 beqi/u r24, 1, tr0 // lastquad
63 st.q r25, 16, r3
64 st.q r20, -16, r3
65 beqi/u r24, 2, tr0 // lastquad
66 st.q r25, 24, r3
67 st.q r20, -24, r3
68lastquad:
69 sthi.q r5, -1, r3 // tail: the part of the final quadword up to the last byte
70 blink tr2,r63
71
72loop: // stores 32 bytes per iteration
73!!! alloco r25, 32 // QQQ comment out for short-term fix to SHUK #3895.
74 // QQQ commenting out is logically correct, but sub-optimal
75 // QQQ Sean McGoogan - 4th April 2003.
76 st.q r25, 8, r3
77 st.q r25, 16, r3
78 st.q r25, 24, r3
79 st.q r25, 32, r3
80 addi r25, 32, r25
81 bgeu/l r8, r25, tr1 // loop
82
83 st.q r20, -40, r3 // fill the last five whole quadwords relative to the end address
84 st.q r20, -32, r3
85 st.q r20, -24, r3
86 st.q r20, -16, r3
87 st.q r20, -8, r3
88 sthi.q r5, -1, r3 // tail, as at lastquad
89 blink tr2,r63
90
91 .size memset,.-memset
diff --git a/arch/sh/lib64/sdivsi3.S b/arch/sh/lib64/sdivsi3.S
new file mode 100644
index 000000000000..6a800c6a4904
--- /dev/null
+++ b/arch/sh/lib64/sdivsi3.S
@@ -0,0 +1,131 @@
1 .global __sdivsi3
2 .section .text..SHmedia32,"ax"
3 .align 2
4
5 /* inputs: r4,r5 */
6 /* clobbered: r1,r18,r19,r20,r21,r25,tr0 */
7 /* result in r0 */
 /*
  * Signed 32-bit division helper.  The code normalises r5 and builds an
  * approximate reciprocal of it (seeded from __div_table), then multiplies
  * r4 by that reciprocal, so r4 is the dividend and r5 the divisor.
  * The fixed-point format of each intermediate value is given in the
  * per-line comments (sN.M / uN.M).
  * No check for a zero divisor is visible in this routine.
  */
8__sdivsi3:
9 ptb __div_table,tr0
10
11 nsb r5, r1
12 shlld r5, r1, r25 /* normalize; [-2 ..1, 1..2) in s2.62 */
13 shari r25, 58, r21 /* extract 5(6) bit index (s2.4 with hole -1..1) */
14 /* bubble */
15 gettr tr0,r20 /* r20 = address of __div_table (tr0 is only used to form it) */
16 ldx.ub r20, r21, r19 /* u0.8 */
17 shari r25, 32, r25 /* normalize to s2.30 */
18 shlli r21, 1, r21 /* scale the index for the 16-bit table entries */
19 muls.l r25, r19, r19 /* s2.38 */
20 ldx.w r20, r21, r21 /* s2.14 */
21 ptabs r18, tr0 /* capture the return target before r18 is reused as scratch */
22 shari r19, 24, r19 /* truncate to s2.14 */
23 sub r21, r19, r19 /* some 11 bit inverse in s1.14 */
24 muls.l r19, r19, r21 /* u0.28 */
25 sub r63, r1, r1
26 addi r1, 92, r1 /* r1 = 92 - nsb(r5): final shift count used by shard below */
27 muls.l r25, r21, r18 /* s2.58 */
28 shlli r19, 45, r19 /* multiply by two and convert to s2.58 */
29 /* bubble */
30 sub r19, r18, r18
31 shari r18, 28, r18 /* some 22 bit inverse in s1.30 */
32 muls.l r18, r25, r0 /* s2.60 */
33 muls.l r18, r4, r25 /* s32.30 */
34 /* bubble */
35 shari r0, 16, r19 /* s-16.44 */
36 muls.l r19, r18, r19 /* s-16.74 */
37 shari r25, 63, r0 /* r0 = sign mask (0 or -1) of the product in r25 */
38 shari r4, 14, r18 /* s19.-14 */
39 shari r19, 30, r19 /* s-16.44 */
40 muls.l r19, r18, r19 /* s15.30 */
41 xor r21, r0, r21 /* You could also use the constant 1 << 27. */
42 add r21, r25, r21
43 sub r21, r19, r21
44 shard r21, r1, r21
45 sub r21, r0, r0 /* result in r0 */
46 blink tr0, r63
47
48/* This table has been generated by divtab.c .
49Defects for bias -330:
50 Max defect: 6.081536e-07 at -1.000000e+00
51 Min defect: 2.849516e-08 at 1.030651e+00
52 Max 2nd step defect: 9.606539e-12 at -1.000000e+00
53 Min 2nd step defect: 0.000000e+00 at 0.000000e+00
54 Defect at 1: 1.238659e-07
55 Defect at -2: 1.061708e-07 */
56
57 .balign 2
58 .type __div_table,@object
59 .size __div_table,128
60/* negative division constants */
61 .word -16638
62 .word -17135
63 .word -17737
64 .word -18433
65 .word -19103
66 .word -19751
67 .word -20583
68 .word -21383
69 .word -22343
70 .word -23353
71 .word -24407
72 .word -25582
73 .word -26863
74 .word -28382
75 .word -29965
76 .word -31800
77/* negative division factors */
78 .byte 66
79 .byte 70
80 .byte 75
81 .byte 81
82 .byte 87
83 .byte 93
84 .byte 101
85 .byte 109
86 .byte 119
87 .byte 130
88 .byte 142
89 .byte 156
90 .byte 172
91 .byte 192
92 .byte 214
93 .byte 241
94 .skip 16
95 .global __div_table
96__div_table:
97 .skip 16
98/* positive division factors */
99 .byte 241
100 .byte 214
101 .byte 192
102 .byte 172
103 .byte 156
104 .byte 142
105 .byte 130
106 .byte 119
107 .byte 109
108 .byte 101
109 .byte 93
110 .byte 87
111 .byte 81
112 .byte 75
113 .byte 70
114 .byte 66
115/* positive division constants */
116 .word 31801
117 .word 29966
118 .word 28383
119 .word 26864
120 .word 25583
121 .word 24408
122 .word 23354
123 .word 22344
124 .word 21384
125 .word 20584
126 .word 19752
127 .word 19104
128 .word 18434
129 .word 17738
130 .word 17136
131 .word 16639
diff --git a/arch/sh/lib64/strcpy.S b/arch/sh/lib64/strcpy.S
new file mode 100644
index 000000000000..ea7c9c533eea
--- /dev/null
+++ b/arch/sh/lib64/strcpy.S
@@ -0,0 +1,97 @@
1/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
2/* Modified by SuperH, Inc. September 2003 */
3! Entry: arg0: destination
4! arg1: source
5! Exit: result: destination
6!
7! SH5 code Copyright 2002 SuperH Ltd.
8
9#if __BYTE_ORDER == __LITTLE_ENDIAN
10#define SHHI shlld
11#define SHLO shlrd
12#else
13#define SHHI shlrd
14#define SHLO shlld
15#endif
16
17 .section .text..SHmedia32,"ax"
18 .globl strcpy
19 .type strcpy, @function
20 .align 5
21
/*
 * strcpy (SHmedia): r2 = destination, r3 = source.
 * r2 is never written below, so the destination pointer is still in r2
 * when the routine returns through tr4 (loaded from r18).
 *
 * The source is read a quadword at a time; mcmpeq.b against r63 flags any
 * zero byte in it.  Quadwords without a terminator are stored with
 * unaligned stlo.q/sthi.q pairs (the loop handles two per iteration).
 * The quadword that holds the terminator is finished one byte at a time
 * in shortstring.
 */
22strcpy:
23
24 pta/l shortstring,tr1
25 ldlo.q r3,0,r4 // source bytes from src up to the end of its quadword
26 ptabs r18,tr4 // tr4 = return target taken from r18
27 shlli r3,3,r7 // r7 = src * 8; used as a shift count (bit offset of src in its quadword)
28 addi r2, 8, r0 // shortstring stores at r0 - 8, i.e. at dest
29 mcmpeq.b r4,r63,r6 // flag every zero byte of the first quadword
30 SHHI r6,r7,r6 // presumably drops flags for bytes that are not part of the string - TODO confirm
31 bnei/u r6,0,tr1 // shortstring
32 pta/l no_lddst, tr2
33 ori r3,-8,r23 // r23 = (src & 7) - 8
34 sub r2, r23, r0 // r0 = dest + 8 - (src & 7)
35 sub r3, r2, r21 // r21 = src - dest
36 addi r21, 8, r20 // r20 = src - dest + 8
37 ldx.q r0, r21, r5 // second source quadword (8-byte aligned address)
38 pta/l loop, tr0
39 ori r2,-8,r22 // r22 = (dest & 7) - 8
40 mcmpeq.b r5, r63, r6
41 bgt/u r22, r23, tr2 // no_lddst
42
43 // r22 < r23 : Need to do a load from the destination.
44 // r22 == r23 : Doesn't actually need to load from destination,
45 // but still can be handled here.
46 ldlo.q r2, 0, r9
47 movi -1, r8
48 SHLO r8, r7, r8
49 mcmv r4, r8, r9 // merge the source bytes into the loaded destination quadword under mask r8
50 stlo.q r2, 0, r9
51 beqi/l r6, 0, tr0 // loop
52
53 add r5, r63, r4 // r4 = r5: the quadword holding the terminator
54 addi r0, 8, r0
55 blink tr1, r63 // shortstring
56no_lddst:
57 // r22 > r23: note that for r22 == r23 the sthi.q would clobber
58 // bytes before the destination region.
59 stlo.q r2, 0, r4
60 SHHI r4, r7, r4
61 sthi.q r0, -1, r4
62 beqi/l r6, 0, tr0 // loop
63
64 add r5, r63, r4 // r4 = r5: the quadword holding the terminator
65 addi r0, 8, r0
66shortstring:
67#if __BYTE_ORDER != __LITTLE_ENDIAN
68 pta/l shortstring2,tr1
69 byterev r4,r4 // big-endian: reverse so the next byte to store is in the low 8 bits
70#endif
71shortstring2: // byte loop: store the low byte of r4 until a zero byte has been stored
72 st.b r0,-8,r4
73 andi r4,0xff,r5 // r5 = the byte just stored
74 shlri r4,8,r4
75 addi r0,1,r0
76 bnei/l r5,0,tr1
77 blink tr4,r63 // return
78
79 .balign 8
80loop:
81 stlo.q r0, 0, r5
82 ldx.q r0, r20, r4 // next source quadword
83 addi r0, 16, r0
84 sthi.q r0, -9, r5
85 mcmpeq.b r4, r63, r6
86 bnei/u r6, 0, tr1 // shortstring
87 ldx.q r0, r21, r5 // source quadword after that
88 stlo.q r0, -8, r4
89 sthi.q r0, -1, r4
90 mcmpeq.b r5, r63, r6
91 beqi/l r6, 0, tr0 // loop
92
93 add r5, r63, r4 // r4 = r5: the quadword holding the terminator
94 addi r0, 8, r0
95 blink tr1, r63 // shortstring
96
97 .size strcpy,.-strcpy
diff --git a/arch/sh/lib64/strlen.S b/arch/sh/lib64/strlen.S
new file mode 100644
index 000000000000..cbc0d912e5f3
--- /dev/null
+++ b/arch/sh/lib64/strlen.S
@@ -0,0 +1,33 @@
1/*
2 * Simplistic strlen() implementation for SHmedia.
3 *
4 * Copyright (C) 2003 Paul Mundt <lethal@linux-sh.org>
5 */
6
7 .section .text..SHmedia32,"ax"
8 .globl strlen
9 .type strlen,@function
10
11 .balign 16
/*
 * strlen (SHmedia): byte-at-a-time string length.
 *   In:      r2 = pointer to the string
 *   Out:     r2 = number of bytes before the terminating zero byte
 *   Scratch: r0 (running count), r1 (byte just loaded), tr0, tr4
 */
12strlen:
13 ptabs r18, tr4 // tr4 = return target taken from r18
14
15 /*
16 * Note: We could easily deal with the NULL case here with a simple
17 * sanity check, though it seems that the behavior we want is to fault
18 * in the event that r2 == NULL, so we don't bother.
19 */
20/* beqi r2, 0, tr4 */ ! Sanity check
21
22 movi -1, r0 // start at -1: the loop also counts the terminator
23 pta/l loop, tr0
24loop:
25 ld.b r2, 0, r1 // r1 = *s
26 addi r2, 1, r2 // s++
27 addi r0, 1, r0 // count++
28 bnei/l r1, 0, tr0 // keep going until a zero byte has been loaded
29
30 or r0, r63, r2 // r2 = r0 (r63 supplies zero): return the count
31 blink tr4, r63
32
33 .size strlen,.-strlen
diff --git a/arch/sh/lib64/udivdi3.S b/arch/sh/lib64/udivdi3.S
new file mode 100644
index 000000000000..6895c0225b85
--- /dev/null
+++ b/arch/sh/lib64/udivdi3.S
@@ -0,0 +1,120 @@
1 .section .text..SHmedia32,"ax"
2 .align 2
3 .global __udivdi3
/*
 * Unsigned 64-bit division helper.  r3 is normalised and an approximate
 * reciprocal of it is refined in r1; partial quotients are accumulated in
 * r8 and their products with the divisor subtracted from r2.  So r2 is the
 * dividend and r3 the divisor, and the quotient is left in r2.
 * Registers written: r0, r1, r2, r4-r9, r19-r22, r25, tr0.
 * Returns through the target loaded from r18.
 * No check for a zero divisor is made (see the comment before the first
 * blink).
 * "Note 1" referenced below is the comment after the end of the routine.
 */
4__udivdi3:
5 shlri r3,1,r4
6 nsb r4,r22 // r22 = normalisation shift for the divisor
7 shlld r3,r22,r6 // r6 = normalised divisor
8 shlri r6,49,r5
9 movi 0xffffffffffffbaf1,r21 /* .l shift count 17. */
10 sub r21,r5,r1
11 mmulfx.w r1,r1,r4
12 mshflo.w r1,r63,r1
13 sub r63,r22,r20 // r63 == 64 % 64
14 mmulfx.w r5,r4,r4
15 pta large_divisor,tr0
16 addi r20,32,r9 // r9 = 32 - r22
17 msub.w r1,r4,r1
18 madd.w r1,r1,r1
19 mmulfx.w r1,r1,r4
20 shlri r6,32,r7 // r7 = upper 32 bits of the normalised divisor
21 bgt/u r9,r63,tr0 // large_divisor
22 mmulfx.w r5,r4,r4
23 shlri r2,32+14,r19
24 addi r22,-31,r0
25 msub.w r1,r4,r1
26
27 mulu.l r1,r7,r4
28 addi r1,-3,r5
29 mulu.l r5,r19,r5
30 sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
31 shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
32 the case may be, %0000000000000000 000.11111111111, still */
33 muls.l r1,r4,r4 /* leaving at least one sign bit. */
34 mulu.l r5,r3,r8
35 mshalds.l r1,r21,r1
36 shari r4,26,r4
37 shlld r8,r0,r8
38 add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
39 sub r2,r8,r2
40 /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */
41
42 shlri r2,22,r21
43 mulu.l r21,r1,r21
44 shlld r5,r0,r8 // r8 = first partial quotient, shifted into position
45 addi r20,30-22,r0
46 shlrd r21,r0,r21
47 mulu.l r21,r3,r5
48 add r8,r21,r8 // accumulate the second partial quotient
49 mcmpgt.l r21,r63,r21 // See Note 1
50 addi r20,30,r0
51 mshfhi.l r63,r21,r21
52 sub r2,r5,r2
53 andc r2,r21,r2
54
55 /* small divisor: need a third divide step */
56 mulu.l r2,r1,r7
57 ptabs r18,tr0 // tr0 = return target taken from r18
58 addi r2,1,r2
59 shlrd r7,r0,r7
60 mulu.l r7,r3,r5
61 add r8,r7,r8 // accumulate the third partial quotient
62 sub r2,r3,r2
63 cmpgt r2,r5,r5 // final adjustment term: 1 if r2 > r5, else 0
64 add r8,r5,r2 // quotient in r2
65 /* could test r3 here to check for divide by zero. */
66 blink tr0,r63
67
68large_divisor:
69 mmulfx.w r5,r4,r4
70 shlrd r2,r9,r25
71 shlri r25,32,r8
72 msub.w r1,r4,r1
73
74 mulu.l r1,r7,r4
75 addi r1,-3,r5
76 mulu.l r5,r8,r5
77 sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
78 shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
79 the case may be, %0000000000000000 000.11111111111, still */
80 muls.l r1,r4,r4 /* leaving at least one sign bit. */
81 shlri r5,14-1,r8
82 mulu.l r8,r7,r5
83 mshalds.l r1,r21,r1
84 shari r4,26,r4
85 add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
86 sub r25,r5,r25
87 /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */
88
89 shlri r25,22,r21
90 mulu.l r21,r1,r21
91 pta no_lo_adj,tr0
92 addi r22,32,r0
93 shlri r21,40,r21
94 mulu.l r21,r7,r5
95 add r8,r21,r8 // accumulate the second partial quotient
96 shlld r2,r0,r2
97 sub r25,r5,r25
98 bgtu/u r7,r25,tr0 // no_lo_adj
99 addi r8,1,r8 // rest still >= r7: bump the quotient and reduce the rest
100 sub r25,r7,r25
101no_lo_adj:
102 mextr4 r2,r25,r2
103
104 /* large_divisor: only needs a few adjustments. */
105 mulu.l r8,r6,r5
106 ptabs r18,tr0 // tr0 = return target taken from r18
107 /* bubble */
108 cmpgtu r5,r2,r5 // 1 if the estimate overshoots, else 0
109 sub r8,r5,r2 // quotient in r2
110 blink tr0,r63
112/* Note 1: To shift the result of the second divide stage so that the result
113 always fits into 32 bits, yet we still reduce the rest sufficiently
114 would require a lot of instructions to do the shifts just right. Using
115 the full 64 bit shift result to multiply with the divisor would require
116 four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
117 Fortunately, if the upper 32 bits of the shift result are nonzero, we
118 know that the rest after taking this partial result into account will
119 fit into 32 bits. So we just clear the upper 32 bits of the rest if the
120 upper 32 bits of the partial result are nonzero. */
diff --git a/arch/sh/lib64/udivsi3.S b/arch/sh/lib64/udivsi3.S
new file mode 100644
index 000000000000..e68120e4b847
--- /dev/null
+++ b/arch/sh/lib64/udivsi3.S
@@ -0,0 +1,59 @@
1 .global __udivsi3
2 .section .text..SHmedia32,"ax"
3 .align 2
4
5/*
6 inputs: r4,r5
7 clobbered: r18,r19,r20,r21,r22,r25,tr0
8 result in r0.
9 */
/*
 * Unsigned 32-bit division helper.  r5 is zero-extended into r22 and an
 * approximate reciprocal of it is built in r21; the quotient is then
 * accumulated in r18 over three multiply / subtract steps on r4.  So r4 is
 * the dividend and r5 the divisor, with the quotient returned in r0.
 * No check for a zero divisor is visible in this routine.
 */
10__udivsi3:
11 addz.l r5,r63,r22 // r22 = r5 zero-extended to 64 bits
12 nsb r22,r0
13 shlld r22,r0,r25 // normalise the divisor
14 shlri r25,48,r25
15 movi 0xffffffffffffbb0c,r20 /* shift count equiv 76 */
16 sub r20,r25,r21
17 mmulfx.w r21,r21,r19
18 mshflo.w r21,r63,r21
19 ptabs r18,tr0 // capture the return target before r18 is reused as scratch
20 mmulfx.w r25,r19,r19
21 sub r20,r0,r0 // r0 = shift count used by the shlrd steps below
22 /* bubble */
23 msub.w r21,r19,r19
24
25 /*
26 * It would be nice for scheduling to do this add to r21 before
27 * the msub.w, but we need a different value for r19 to keep
28 * errors under control.
29 */
30 addi r19,-2,r21
31 mulu.l r4,r21,r18
32 mmulfx.w r19,r19,r19
33 shlli r21,15,r21
34 shlrd r18,r0,r18 // r18 = first quotient estimate
35 mulu.l r18,r22,r20
36 mmacnfx.wl r25,r19,r21
37 /* bubble */
38 sub r4,r20,r25 // r25 = dividend - estimate * divisor
39
40 mulu.l r25,r21,r19
41 addi r0,14,r0
42 /* bubble */
43 shlrd r19,r0,r19
44 mulu.l r19,r22,r20
45 add r18,r19,r18 // accumulate the second partial quotient
46 /* bubble */
47 sub.l r25,r20,r25
48
49 mulu.l r25,r21,r19
50 addz.l r25,r63,r25
51 sub r25,r22,r25
52 shlrd r19,r0,r19
53 mulu.l r19,r22,r20
54 addi r25,1,r25
55 add r18,r19,r18 // accumulate the third partial quotient
56
57 cmpgt r25,r20,r25 // final adjustment term: 1 if r25 > r20, else 0
58 add.l r18,r25,r0 // result in r0
59 blink tr0,r63