aboutsummaryrefslogtreecommitdiffstats
path: root/arch/sh/lib64
diff options
context:
space:
mode:
authorPaul Mundt <lethal@linux-sh.org>2008-12-12 02:34:44 -0500
committerPaul Mundt <lethal@linux-sh.org>2008-12-22 04:44:05 -0500
commit4466b20cfcfa718ff515b9e3886749cc025e2005 (patch)
treef12402e006de74df252dff03d455af0a9e2bfa0b /arch/sh/lib64
parent776d6c298aad42c2b8f191fa9ad826075e4d588c (diff)
sh: Add SH-5 optimized memcpy()/memset()/strcpy()/strlen().
Adopted from the uClibc optimized string versions. Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Diffstat (limited to 'arch/sh/lib64')
-rw-r--r--arch/sh/lib64/Makefile7
-rw-r--r--arch/sh/lib64/memcpy.S201
-rw-r--r--arch/sh/lib64/memcpy.c81
-rw-r--r--arch/sh/lib64/memset.S91
-rw-r--r--arch/sh/lib64/strcpy.S97
-rw-r--r--arch/sh/lib64/strlen.S33
6 files changed, 425 insertions, 85 deletions
diff --git a/arch/sh/lib64/Makefile b/arch/sh/lib64/Makefile
index 9950966923a0..1d932e7d0ca0 100644
--- a/arch/sh/lib64/Makefile
+++ b/arch/sh/lib64/Makefile
@@ -2,7 +2,7 @@
2# Makefile for the SH-5 specific library files.. 2# Makefile for the SH-5 specific library files..
3# 3#
4# Copyright (C) 2000, 2001 Paolo Alberelli 4# Copyright (C) 2000, 2001 Paolo Alberelli
5# Copyright (C) 2003 Paul Mundt 5# Copyright (C) 2003 - 2008 Paul Mundt
6# 6#
7# This file is subject to the terms and conditions of the GNU General Public 7# This file is subject to the terms and conditions of the GNU General Public
8# License. See the file "COPYING" in the main directory of this archive 8# License. See the file "COPYING" in the main directory of this archive
@@ -10,6 +10,5 @@
10# 10#
11 11
12# Panic should really be compiled as PIC 12# Panic should really be compiled as PIC
13lib-y := udelay.o c-checksum.o dbg.o panic.o memcpy.o copy_user_memcpy.o \ 13lib-y := udelay.o c-checksum.o dbg.o panic.o memcpy.o memset.o \
14 copy_page.o clear_page.o 14 copy_user_memcpy.o copy_page.o clear_page.o strcpy.o strlen.o
15
diff --git a/arch/sh/lib64/memcpy.S b/arch/sh/lib64/memcpy.S
new file mode 100644
index 000000000000..dd300c372ce1
--- /dev/null
+++ b/arch/sh/lib64/memcpy.S
@@ -0,0 +1,201 @@
1/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
2/* Modified by SuperH, Inc. September 2003 */
3!
4! Fast SH memcpy
5!
6! by Toshiyasu Morita (tm@netcom.com)
7! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
8! SH5 code Copyright 2002 SuperH Ltd.
9!
10! Entry: ARG0: destination pointer
11! ARG1: source pointer
12! ARG2: byte count
13!
14! Exit: RESULT: destination pointer
15! any other registers in the range r0-r7: trashed
16!
17! Notes: Usually one wants to do small reads and write a longword, but
18! unfortunately it is difficult in some cases to concatenate bytes
19! into a longword on the SH, so this does a longword read and small
20! writes.
21!
22! This implementation makes two assumptions about how it is called:
23!
24! 1.: If the byte count is nonzero, the address of the last byte to be
25! copied is unsigned greater than the address of the first byte to
26! be copied. This could be easily swapped for a signed comparison,
27! but the algorithm used needs some comparison.
28!
29! 2.: When there are two or three bytes in the last word of an 11-or-more
30! bytes memory chunk to be copied, the rest of the word can be read
31! without side effects.
32! This could be easily changed by increasing the minimum size of
33! a fast memcpy and the amount subtracted from r7 before L_2l_loop by 2,
34! however, this would cost a few extra cycles on average.
35! For SHmedia, the assumption is that any quadword can be read in its
36! entirety if at least one byte is included in the copy.
37!
38
39 .section .text..SHmedia32,"ax"
40 .globl memcpy
41 .type memcpy, @function
42 .align 5
43
44memcpy: // memcpy(dest, src, count) for SHmedia
45 // In: r2 = dest, r3 = src, r4 = count, r18 = return address. r2 is never written here, so it still holds dest on return.
46#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
47#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
48#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
49#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
50 // Counts below 25 take a computed branch into the fixed-size handlers after L1; counts of 25 or more go to Large.
51 ld.b r3,0,r63 // result discarded (r63); presumably touches the first source byte early - TODO confirm
52 pta/l Large,tr0
53 movi 25,r0
54 bgeu/u r4,r0,tr0 // count >= 25 (unsigned): take the Large path
55 nsb r4,r0 // the size class is derived from nsb(count)
56 shlli r0,5,r0 // * 32: one handler slot per size class, 8 instructions each (assuming 4-byte instructions)
57 movi (L1-L0+63*32 + 1) & 0xffff,r1
58 sub r1, r0, r0 // r0 = offset from L0 to the handler slot; presumably nsb(0) is 63, selecting L1 - TODO confirm
59L0: ptrel r0,tr0
60 add r2,r4,r5 // r5 = dest + count (one past the last destination byte)
61 ptabs r18,tr1 // tr1 = return address
62 add r3,r4,r6 // r6 = src + count (one past the last source byte)
63 blink tr0,r63 // jump to the selected handler
64
65/* Rearranged to make cut2 safe */
66 .balign 8
67L4_7: /* 4..7 byte memcpy cntd. */
68 stlo.l r2, 0, r0
69 or r6, r7, r6 // combine the two halves of the last 4 source bytes
70 sthi.l r5, -1, r6
71 stlo.l r5, -4, r6 // last 4 destination bytes; these overlap the first 4 when count < 8
72 blink tr1,r63 // return
73
74 .balign 8
75L1: /* 0 byte memcpy */
76 nop
77 blink tr1,r63
78 nop // the nops pad this slot: L1 plus L2_3 below total 8 instructions
79 nop
80 nop
81 nop
82
83L2_3: /* 2 or 3 byte memcpy cntd. */
84 st.b r5,-1,r6 // last destination byte
85 blink tr1,r63
86
87 /* 1 byte memcpy */
88 ld.b r3,0,r0
89 st.b r2,0,r0
90 blink tr1,r63
91
92L8_15: /* 8..15 byte memcpy cntd. */
93 stlo.q r2, 0, r0
94 or r6, r7, r6 // combine the two halves of the last 8 source bytes
95 sthi.q r5, -1, r6
96 stlo.q r5, -8, r6 // last 8 destination bytes; these overlap the first 8 when count < 16
97 blink tr1,r63
98
99 /* 2 or 3 byte memcpy */
100 ld.b r3,0,r0
101 ld.b r2,0,r63 // result discarded (r63); touches the first destination byte
102 ld.b r3,1,r1
103 st.b r2,0,r0
104 pta/l L2_3,tr0
105 ld.b r6,-1,r6 // r6 = last source byte (same as byte 1 when count is 2)
106 st.b r2,1,r1
107 blink tr0, r63 // continue at L2_3
108
109 /* 4 .. 7 byte memcpy */
110 LDUAL (r3, 0, r0, r1)
111 pta L4_7, tr0
112 ldlo.l r6, -4, r7
113 or r0, r1, r0 // r0 = first 4 source bytes
114 sthi.l r2, 3, r0
115 ldhi.l r6, -1, r6
116 blink tr0, r63 // continue at L4_7
117
118 /* 8 .. 15 byte memcpy */
119 LDUAQ (r3, 0, r0, r1)
120 pta L8_15, tr0
121 ldlo.q r6, -8, r7
122 or r0, r1, r0 // r0 = first 8 source bytes
123 sthi.q r2, 7, r0
124 ldhi.q r6, -1, r6
125 blink tr0, r63 // continue at L8_15
126
127 /* 16 .. 24 byte memcpy */
128 LDUAQ (r3, 0, r0, r1)
129 LDUAQ (r3, 8, r8, r9)
130 or r0, r1, r0 // r0 = source bytes 0..7
131 sthi.q r2, 7, r0
132 or r8, r9, r8 // r8 = source bytes 8..15
133 sthi.q r2, 15, r8
134 ldlo.q r6, -8, r7
135 ldhi.q r6, -1, r6
136 stlo.q r2, 8, r8
137 stlo.q r2, 0, r0
138 or r6, r7, r6 // r6 = last 8 source bytes
139 sthi.q r5, -1, r6
140 stlo.q r5, -8, r6 // last 8 destination bytes; may overlap bytes 8..15
141 blink tr1,r63 // return
142
143Large: // count >= 25
144 ld.b r2, 0, r63 // result discarded (r63); touches the first destination byte
145 pta/l Loop_ua, tr1 // tr1 is reused as the Loop_ua target until the final ptabs
146 ori r3, -8, r7 // r7 = (src & 7) - 8
147 sub r2, r7, r22 // r22 = dest + 8 - (src & 7)
148 sub r3, r2, r6 // r6 = src - dest, so r22 + r6 is the next 8-byte-aligned source address
149 add r2, r4, r5
150 ldlo.q r3, 0, r0
151 addi r5, -16, r5 // r5 = dest + count - 16
152 movi 64+8, r27 // could subtract r7 from that.
153 stlo.q r2, 0, r0
154 sthi.q r2, 7, r0
155 ldx.q r22, r6, r0 // r0 = aligned source quadword that belongs at r22
156 bgtu/l r27, r4, tr1 // count < 72: skip the 32-byte loop and go to Loop_ua
157
158 addi r5, -48, r27 // r27 = dest + count - 64, the bound for Loop_line
159 pta/l Loop_line, tr0
160 addi r6, 64, r36 // source index 64 bytes ahead of r22, used for the discarded load in Loop_line
161 addi r6, -24, r19 // r19, r20, r21 = source indexes 24, 16 and 8 bytes behind r22
162 addi r6, -16, r20
163 addi r6, -8, r21
164
165Loop_line: // moves 32 bytes per iteration; r0 carries one loaded quadword across iterations
166 ldx.q r22, r36, r63 // result discarded (r63); presumably a source prefetch - TODO confirm
167 alloco r22, 32
168 addi r22, 32, r22
169 ldx.q r22, r19, r23
170 sthi.q r22, -25, r0
171 ldx.q r22, r20, r24
172 ldx.q r22, r21, r25
173 stlo.q r22, -32, r0
174 ldx.q r22, r6, r0 // quadword for the next iteration (or for Loop_ua)
175 sthi.q r22, -17, r23
176 sthi.q r22, -9, r24
177 sthi.q r22, -1, r25
178 stlo.q r22, -24, r23
179 stlo.q r22, -16, r24
180 stlo.q r22, -8, r25
181 bgeu r27, r22, tr0 // repeat while r22 <= dest + count - 64
182
183Loop_ua: // moves 8 bytes per iteration
184 addi r22, 8, r22
185 sthi.q r22, -1, r0
186 stlo.q r22, -8, r0
187 ldx.q r22, r6, r0
188 bgtu/l r5, r22, tr1 // repeat while r22 < dest + count - 16
189
190 add r3, r4, r7 // r7 = src + count
191 ldlo.q r7, -8, r1 // r1 and r7 receive the two halves of the last 8 source bytes
192 sthi.q r22, 7, r0
193 ldhi.q r7, -1, r7
194 ptabs r18,tr1 // tr1 = return address again
195 stlo.q r22, 0, r0
196 or r1, r7, r1
197 sthi.q r5, 15, r1 // last 8 destination bytes: r5 + 8 .. r5 + 15
198 stlo.q r5, 8, r1
199 blink tr1, r63 // return
200
201 .size memcpy,.-memcpy
diff --git a/arch/sh/lib64/memcpy.c b/arch/sh/lib64/memcpy.c
deleted file mode 100644
index fba436a92bfa..000000000000
--- a/arch/sh/lib64/memcpy.c
+++ /dev/null
@@ -1,81 +0,0 @@
1/*
2 * Copyright (C) 2002 Mark Debbage (Mark.Debbage@superh.com)
3 *
4 * May be copied or modified under the terms of the GNU General Public
5 * License. See linux/COPYING for more information.
6 *
7 */
8
9#include <linux/types.h>
10#include <asm/string.h>
11
12// This is a simplistic optimization of memcpy to increase the
13// granularity of access beyond one byte using aligned
14// loads and stores. This is not an optimal implementation
15// for SH-5 (especially with regard to prefetching and the cache),
16// and a better version should be provided later ...
17
/*
 * memcpy - copy count bytes from src to dest.
 *
 * The two regions must not overlap. For copies of 32 bytes or more the
 * destination is first brought up to 8-byte alignment with byte copies;
 * wider aligned accesses are then used for as long as both pointers are
 * suitably aligned. Whatever remains is copied one byte at a time.
 *
 * Returns dest. The previous version returned the advanced write cursor
 * (dest + count), which breaks every caller that uses the result.
 */
void *memcpy(void *dest, const void *src, size_t count)
{
	char *d = (char *) dest;
	const char *s = (const char *) src;

	if (count >= 32) {
		/* Number of bytes needed to reach the next 8-byte boundary of d. */
		int i = 8 - (((unsigned long) d) & 0x7);

		/* count >= 32 and i <= 7 here, so count cannot underflow. */
		if (i != 8)
			while (i-- && count--) {
				*d++ = *s++;
			}

		/* Both 8-byte aligned: 32 bytes per pass, then 8 bytes per pass. */
		if (((((unsigned long) d) & 0x7) == 0) &&
		    ((((unsigned long) s) & 0x7) == 0)) {
			while (count >= 32) {
				unsigned long long t1, t2, t3, t4;
				t1 = *(const unsigned long long *) (s);
				t2 = *(const unsigned long long *) (s + 8);
				t3 = *(const unsigned long long *) (s + 16);
				t4 = *(const unsigned long long *) (s + 24);
				*(unsigned long long *) (d) = t1;
				*(unsigned long long *) (d + 8) = t2;
				*(unsigned long long *) (d + 16) = t3;
				*(unsigned long long *) (d + 24) = t4;
				d += 32;
				s += 32;
				count -= 32;
			}
			while (count >= 8) {
				*(unsigned long long *) d =
				    *(const unsigned long long *) s;
				d += 8;
				s += 8;
				count -= 8;
			}
		}

		/*
		 * Both 4-byte aligned. The access type is unsigned int, not
		 * unsigned long: the loop advances 4 bytes per access, and
		 * unsigned long is 8 bytes wide on LP64 targets, which would
		 * write past the end of the destination.
		 */
		if (((((unsigned long) d) & 0x3) == 0) &&
		    ((((unsigned long) s) & 0x3) == 0)) {
			while (count >= 4) {
				*(unsigned int *) d = *(const unsigned int *) s;
				d += 4;
				s += 4;
				count -= 4;
			}
		}

		/* Both 2-byte aligned. */
		if (((((unsigned long) d) & 0x1) == 0) &&
		    ((((unsigned long) s) & 0x1) == 0)) {
			while (count >= 2) {
				*(unsigned short *) d = *(const unsigned short *) s;
				d += 2;
				s += 2;
				count -= 2;
			}
		}
	}

	/* Short copies and any unaligned remainder. */
	while (count--) {
		*d++ = *s++;
	}

	return dest;
}
diff --git a/arch/sh/lib64/memset.S b/arch/sh/lib64/memset.S
new file mode 100644
index 000000000000..2d37b0488552
--- /dev/null
+++ b/arch/sh/lib64/memset.S
@@ -0,0 +1,91 @@
1/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
2/* Modified by SuperH, Inc. September 2003 */
3!
4! Fast SH memset
5!
6! by Toshiyasu Morita (tm@netcom.com)
7!
8! SH5 code by J"orn Rennecke (joern.rennecke@superh.com)
9! Copyright 2002 SuperH Ltd.
10!
11
12#if __BYTE_ORDER == __LITTLE_ENDIAN
13#define SHHI shlld
14#define SHLO shlrd
15#else
16#define SHHI shlrd
17#define SHLO shlld
18#endif
19
20 .section .text..SHmedia32,"ax"
21 .globl memset
22 .type memset, @function
23
24 .align 5
25
26memset: // memset for SHmedia. In: r2 = dest, r3 = fill byte, r4 = count, r18 = return address. r2 is never written here.
27 pta/l multiquad, tr0
28 andi r2, 7, r22 // r22 = dest & 7
29 ptabs r18, tr2 // tr2 = return address
30 mshflo.b r3,r3,r3
31 add r4, r22, r23 // r23 = count + (dest & 7)
32 mperm.w r3, r63, r3 // Fill pattern now in every byte of r3
33
34 movi 8, r9
35 bgtu/u r23, r9, tr0 // multiquad
36 // Fall through: the whole region lies inside one aligned quadword.
37 beqi/u r4, 0, tr2 // Return with size 0 - ensures no mem accesses
38 ldlo.q r2, 0, r7 // r7 = current memory contents, merged with the pattern below
39 shlli r4, 2, r4 // r4 = count * 4
40 movi -1, r8
41 SHHI r8, r4, r8
42 SHHI r8, r4, r8 // mask shifted by count * 4 twice, i.e. count * 8 bits; presumably split because count can be 8 here - TODO confirm
43 mcmv r7, r8, r3 // presumably keeps the old bytes from r7 wherever the mask r8 is set - TODO confirm
44 stlo.q r2, 0, r3
45 blink tr2, r63 // return
46
47multiquad: // the region extends past the first aligned quadword
48 pta/l lastquad, tr0
49 stlo.q r2, 0, r3
50 shlri r23, 3, r24 // r24 = (count + (dest & 7)) / 8
51 add r2, r4, r5 // r5 = dest + count
52 beqi/u r24, 1, tr0 // lastquad
53 pta/l loop, tr1
54 sub r2, r22, r25 // r25 = dest rounded down to a multiple of 8
55 andi r5, -8, r20 // calculate end address and
56 addi r20, -7*8, r8 // loop end address; This might overflow, so we need
57 // to use a different test before we start the loop
58 bge/u r24, r9, tr1 // loop
59 st.q r25, 8, r3 // fewer than 8 quadwords: fill inwards from both ends
60 st.q r20, -8, r3
61 shlri r24, 1, r24
62 beqi/u r24, 1, tr0 // lastquad
63 st.q r25, 16, r3
64 st.q r20, -16, r3
65 beqi/u r24, 2, tr0 // lastquad
66 st.q r25, 24, r3
67 st.q r20, -24, r3
68lastquad:
69 sthi.q r5, -1, r3 // store ending at the last byte, dest + count - 1
70 blink tr2,r63 // return
71
72loop: // four aligned quadword stores (32 bytes) per iteration
73!!! alloco r25, 32 // QQQ comment out for short-term fix to SHUK #3895.
74 // QQQ commenting out is logically correct, but sub-optimal
75 // QQQ Sean McGoogan - 4th April 2003.
76 st.q r25, 8, r3
77 st.q r25, 16, r3
78 st.q r25, 24, r3
79 st.q r25, 32, r3
80 addi r25, 32, r25
81 bgeu/l r8, r25, tr1 // loop
82
83 st.q r20, -40, r3 // tail: the five quadwords below the aligned end address r20
84 st.q r20, -32, r3
85 st.q r20, -24, r3
86 st.q r20, -16, r3
87 st.q r20, -8, r3
88 sthi.q r5, -1, r3 // store ending at the last byte, dest + count - 1
89 blink tr2,r63 // return
90
91 .size memset,.-memset
diff --git a/arch/sh/lib64/strcpy.S b/arch/sh/lib64/strcpy.S
new file mode 100644
index 000000000000..ea7c9c533eea
--- /dev/null
+++ b/arch/sh/lib64/strcpy.S
@@ -0,0 +1,97 @@
1/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
2/* Modified by SuperH, Inc. September 2003 */
3! Entry: arg0: destination
4! arg1: source
5! Exit: result: destination
6!
7! SH5 code Copyright 2002 SuperH Ltd.
8
9#if __BYTE_ORDER == __LITTLE_ENDIAN
10#define SHHI shlld
11#define SHLO shlrd
12#else
13#define SHHI shlrd
14#define SHLO shlld
15#endif
16
17 .section .text..SHmedia32,"ax"
18 .globl strcpy
19 .type strcpy, @function
20 .align 5
21
22strcpy: // strcpy for SHmedia. In: r2 = dest, r3 = src, r18 = return address. r2 is never written here.
23
24 pta/l shortstring,tr1
25 ldlo.q r3,0,r4
26 ptabs r18,tr4 // tr4 = return address
27 shlli r3,3,r7 // r7 = src * 8, used below as a shift count
28 addi r2, 8, r0 // r0 = dest + 8; shortstring2 stores at r0 - 8
29 mcmpeq.b r4,r63,r6 // compare the bytes of r4 against r63 (zero) to look for the terminator
30 SHHI r6,r7,r6
31 bnei/u r6,0,tr1 // shortstring
32 pta/l no_lddst, tr2
33 ori r3,-8,r23 // r23 = (src & 7) - 8
34 sub r2, r23, r0 // r0 = dest + 8 - (src & 7)
35 sub r3, r2, r21 // r21 = src - dest
36 addi r21, 8, r20 // r20 = src - dest + 8
37 ldx.q r0, r21, r5 // r5 = next source quadword; r0 + r21 is 8-byte aligned
38 pta/l loop, tr0
39 ori r2,-8,r22 // r22 = (dest & 7) - 8
40 mcmpeq.b r5, r63, r6
41 bgt/u r22, r23, tr2 // no_lddst
42
43 // r22 < r23 : Need to do a load from the destination.
44 // r22 == r23 : Doesn't actually need to load from destination,
45 // but still can be handled here.
46 ldlo.q r2, 0, r9
47 movi -1, r8
48 SHLO r8, r7, r8
49 mcmv r4, r8, r9 // merge the first source bytes (r4) into the loaded destination quadword (r9) under mask r8
50 stlo.q r2, 0, r9
51 beqi/l r6, 0, tr0 // loop
52
53 add r5, r63, r4 // r4 = r5: this quadword holds the terminator, finish byte by byte
54 addi r0, 8, r0
55 blink tr1, r63 // shortstring
56no_lddst:
57 // r22 > r23: note that for r22 == r23 the sthi.q would clobber
58 // bytes before the destination region.
59 stlo.q r2, 0, r4
60 SHHI r4, r7, r4
61 sthi.q r0, -1, r4
62 beqi/l r6, 0, tr0 // loop
63
64 add r5, r63, r4 // r4 = r5: this quadword holds the terminator, finish byte by byte
65 addi r0, 8, r0
66shortstring:
67#if __BYTE_ORDER != __LITTLE_ENDIAN
68 pta/l shortstring2,tr1
69 byterev r4,r4
70#endif
71shortstring2: // byte loop: store the low byte of r4 at r0 - 8 until a zero byte has been stored
72 st.b r0,-8,r4
73 andi r4,0xff,r5 // r5 = the byte just stored
74 shlri r4,8,r4
75 addi r0,1,r0
76 bnei/l r5,0,tr1 // tr1 is shortstring, or shortstring2 after the big-endian retarget above
77 blink tr4,r63 // return
78
79 .balign 8
80loop: // two source quadwords per iteration, each checked for a zero byte before it is stored
81 stlo.q r0, 0, r5
82 ldx.q r0, r20, r4
83 addi r0, 16, r0
84 sthi.q r0, -9, r5
85 mcmpeq.b r4, r63, r6
86 bnei/u r6, 0, tr1 // shortstring
87 ldx.q r0, r21, r5
88 stlo.q r0, -8, r4
89 sthi.q r0, -1, r4
90 mcmpeq.b r5, r63, r6
91 beqi/l r6, 0, tr0 // loop
92
93 add r5, r63, r4 // r4 = r5: this quadword holds the terminator, finish byte by byte
94 addi r0, 8, r0
95 blink tr1, r63 // shortstring
96
97 .size strcpy,.-strcpy
diff --git a/arch/sh/lib64/strlen.S b/arch/sh/lib64/strlen.S
new file mode 100644
index 000000000000..cbc0d912e5f3
--- /dev/null
+++ b/arch/sh/lib64/strlen.S
@@ -0,0 +1,33 @@
1/*
2 * Simplistic strlen() implementation for SHmedia.
3 *
4 * Copyright (C) 2003 Paul Mundt <lethal@linux-sh.org>
5 */
6
7 .section .text..SHmedia32,"ax"
8 .globl strlen
9 .type strlen,@function
10
11 .balign 16
12strlen: ! strlen for SHmedia. In: r2 = string pointer, r18 = return address. Out: r2 = length. Writes r0 and r1.
13 ptabs r18, tr4 ! tr4 = return address
14
15 /*
16 * Note: We could easily deal with the NULL case here with a simple
17 * sanity check, though it seems that the behavior we want is to fault
18 * in the event that r2 == NULL, so we don't bother.
19 */
20/* beqi r2, 0, tr4 */ ! Sanity check
21
22 movi -1, r0 ! start the count at -1 so the terminating zero byte is not counted
23 pta/l loop, tr0
24loop:
25 ld.b r2, 0, r1 ! r1 = byte at the current position
26 addi r2, 1, r2 ! advance the pointer
27 addi r0, 1, r0 ! one more byte seen
28 bnei/l r1, 0, tr0 ! repeat until the byte just read was zero
29
30 or r0, r63, r2 ! r2 = r0: move the count into the result register
31 blink tr4, r63 ! return
32
33 .size strlen,.-strlen