diff options
Diffstat (limited to 'arch/cris/arch-v32')
-rw-r--r-- | arch/cris/arch-v32/lib/string.c | 325 |
1 files changed, 171 insertions, 154 deletions
diff --git a/arch/cris/arch-v32/lib/string.c b/arch/cris/arch-v32/lib/string.c index 6740b2cebae5..c7bd6ebdc93c 100644 --- a/arch/cris/arch-v32/lib/string.c +++ b/arch/cris/arch-v32/lib/string.c | |||
@@ -1,55 +1,59 @@ | |||
1 | /*#************************************************************************#*/ | 1 | /* A memcpy for CRIS. |
2 | /*#-------------------------------------------------------------------------*/ | 2 | Copyright (C) 1994-2005 Axis Communications. |
3 | /*# */ | 3 | All rights reserved. |
4 | /*# FUNCTION NAME: memcpy() */ | 4 | |
5 | /*# */ | 5 | Redistribution and use in source and binary forms, with or without |
6 | /*# PARAMETERS: void* dst; Destination address. */ | 6 | modification, are permitted provided that the following conditions |
7 | /*# void* src; Source address. */ | 7 | are met: |
8 | /*# int len; Number of bytes to copy. */ | 8 | |
9 | /*# */ | 9 | 1. Redistributions of source code must retain the above copyright |
10 | /*# RETURNS: dst. */ | 10 | notice, this list of conditions and the following disclaimer. |
11 | /*# */ | 11 | |
12 | /*# DESCRIPTION: Copies len bytes of memory from src to dst. No guarantees */ | 12 | 2. Neither the name of Axis Communications nor the names of its |
13 | /*# about copying of overlapping memory areas. This routine is */ | 13 | contributors may be used to endorse or promote products derived |
14 | /*# very sensitive to compiler changes in register allocation. */ | 14 | from this software without specific prior written permission. |
15 | /*# Should really be rewritten to avoid this problem. */ | 15 | |
16 | /*# */ | 16 | THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS |
17 | /*#-------------------------------------------------------------------------*/ | 17 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
18 | /*# */ | 18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
19 | /*# HISTORY */ | 19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS |
20 | /*# */ | 20 | COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, |
21 | /*# DATE NAME CHANGES */ | 21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
22 | /*# ---- ---- ------- */ | 22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
23 | /*# 941007 Kenny R Creation */ | 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
24 | /*# 941011 Kenny R Lots of optimizations and inlining. */ | 24 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
25 | /*# 941129 Ulf A Adapted for use in libc. */ | 25 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING |
26 | /*# 950216 HP N==0 forgotten if non-aligned src/dst. */ | 26 | IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
27 | /*# Added some optimizations. */ | 27 | POSSIBILITY OF SUCH DAMAGE. */ |
28 | /*# 001025 HP Make src and dst char *. Align dst to */ | 28 | |
29 | /*# dword, not just word-if-both-src-and-dst- */ | 29 | /* FIXME: This file should really only be used for reference, as the |
30 | /*# are-misaligned. */ | 30 | result depends somewhat on gcc generating what we expect rather |
31 | /*# */ | 31 | than what we describe. An assembly file should be used instead. */ |
32 | /*#-------------------------------------------------------------------------*/ | 32 | |
33 | 33 | #include <stddef.h> | |
34 | #include <linux/types.h> | 34 | |
35 | 35 | /* Break even between movem and move16 is really at 38.7 * 2, but | |
36 | void *memcpy(void *pdst, | 36 | modulo 44, so up to the next multiple of 44, we use ordinary code. */ |
37 | const void *psrc, | 37 | #define MEMCPY_BY_BLOCK_THRESHOLD (44 * 2) |
38 | size_t pn) | 38 | |
39 | /* No name ambiguities in this file. */ | ||
40 | __asm__ (".syntax no_register_prefix"); | ||
41 | |||
42 | void * | ||
43 | memcpy(void *pdst, const void *psrc, size_t pn) | ||
39 | { | 44 | { |
40 | /* Ok. Now we want the parameters put in special registers. | 45 | /* Now we want the parameters put in special registers. |
41 | Make sure the compiler is able to make something useful of this. | 46 | Make sure the compiler is able to make something useful of this. |
42 | As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). | 47 | As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). |
43 | 48 | ||
44 | If gcc was alright, it really would need no temporaries, and no | 49 | If gcc was all right, it really would need no temporaries, and no |
45 | stack space to save stuff on. */ | 50 | stack space to save stuff on. */ |
46 | 51 | ||
47 | register void *return_dst __asm__ ("r10") = pdst; | 52 | register void *return_dst __asm__ ("r10") = pdst; |
48 | register char *dst __asm__ ("r13") = pdst; | 53 | register unsigned char *dst __asm__ ("r13") = pdst; |
49 | register const char *src __asm__ ("r11") = psrc; | 54 | register unsigned const char *src __asm__ ("r11") = psrc; |
50 | register int n __asm__ ("r12") = pn; | 55 | register int n __asm__ ("r12") = pn; |
51 | 56 | ||
52 | |||
53 | /* When src is aligned but not dst, this makes a few extra needless | 57 | /* When src is aligned but not dst, this makes a few extra needless |
54 | cycles. I believe it would take as many to check that the | 58 | cycles. I believe it would take as many to check that the |
55 | re-alignment was unnecessary. */ | 59 | re-alignment was unnecessary. */ |
@@ -59,161 +63,174 @@ void *memcpy(void *pdst, | |||
59 | && n >= 3) | 63 | && n >= 3) |
60 | { | 64 | { |
61 | if ((unsigned long) dst & 1) | 65 | if ((unsigned long) dst & 1) |
62 | { | 66 | { |
63 | n--; | 67 | n--; |
64 | *(char*)dst = *(char*)src; | 68 | *dst = *src; |
65 | src++; | 69 | src++; |
66 | dst++; | 70 | dst++; |
67 | } | 71 | } |
68 | 72 | ||
69 | if ((unsigned long) dst & 2) | 73 | if ((unsigned long) dst & 2) |
70 | { | 74 | { |
71 | n -= 2; | 75 | n -= 2; |
72 | *(short*)dst = *(short*)src; | 76 | *(short *) dst = *(short *) src; |
73 | src += 2; | 77 | src += 2; |
74 | dst += 2; | 78 | dst += 2; |
75 | } | 79 | } |
76 | } | 80 | } |
77 | 81 | ||
78 | /* Decide which copying method to use. Movem is dirt cheap, so the | 82 | /* Decide which copying method to use. */ |
79 | overheap is low enough to always use the minimum block size as the | 83 | if (n >= MEMCPY_BY_BLOCK_THRESHOLD) |
80 | threshold. */ | 84 | { |
81 | if (n >= 44) | 85 | /* It is not optimal to tell the compiler about clobbering any |
82 | { | 86 | registers; that will move the saving/restoring of those registers |
83 | /* For large copies we use 'movem' */ | 87 | to the function prologue/epilogue, and make non-movem sizes |
84 | 88 | suboptimal. */ | |
85 | /* It is not optimal to tell the compiler about clobbering any | 89 | __asm__ volatile |
86 | registers; that will move the saving/restoring of those registers | 90 | ("\ |
87 | to the function prologue/epilogue, and make non-movem sizes | 91 | ;; GCC does promise correct register allocations, but let's \n\ |
88 | suboptimal. */ | 92 | ;; make sure it keeps its promises. \n\ |
89 | __asm__ volatile (" \n\ | 93 | .ifnc %0-%1-%2,$r13-$r11-$r12 \n\ |
90 | ;; Check that the register asm declaration got right. \n\ | 94 | .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\" \n\ |
91 | ;; The GCC manual explicitly says TRT will happen. \n\ | 95 | .endif \n\ |
92 | .ifnc %0-%1-%2,$r13-$r11-$r12 \n\ | ||
93 | .err \n\ | ||
94 | .endif \n\ | ||
95 | \n\ | ||
96 | ;; Save the registers we'll use in the movem process \n\ | ||
97 | \n\ | 96 | \n\ |
98 | ;; on the stack. \n\ | 97 | ;; Save the registers we'll use in the movem process \n\ |
99 | subq 11*4,$sp \n\ | 98 | ;; on the stack. \n\ |
100 | movem $r10,[$sp] \n\ | 99 | subq 11*4,sp \n\ |
100 | movem r10,[sp] \n\ | ||
101 | \n\ | 101 | \n\ |
102 | ;; Now we've got this: \n\ | 102 | ;; Now we've got this: \n\ |
103 | ;; r11 - src \n\ | 103 | ;; r11 - src \n\ |
104 | ;; r13 - dst \n\ | 104 | ;; r13 - dst \n\ |
105 | ;; r12 - n \n\ | 105 | ;; r12 - n \n\ |
106 | \n\ | 106 | \n\ |
107 | ;; Update n for the first loop \n\ | 107 | ;; Update n for the first loop. \n\ |
108 | subq 44,$r12 \n\ | 108 | subq 44,r12 \n\ |
109 | 0: \n\ | 109 | 0: \n\ |
110 | movem [$r11+],$r10 \n\ | 110 | " |
111 | subq 44,$r12 \n\ | 111 | #ifdef __arch_common_v10_v32 |
112 | bge 0b \n\ | 112 | /* Cater to branch offset difference between v32 and v10. We |
113 | movem $r10,[$r13+] \n\ | 113 | assume the branch below has an 8-bit offset. */ |
114 | " setf\n" | ||
115 | #endif | ||
116 | " movem [r11+],r10 \n\ | ||
117 | subq 44,r12 \n\ | ||
118 | bge 0b \n\ | ||
119 | movem r10,[r13+] \n\ | ||
114 | \n\ | 120 | \n\ |
115 | addq 44,$r12 ;; compensate for last loop underflowing n \n\ | 121 | ;; Compensate for last loop underflowing n. \n\ |
122 | addq 44,r12 \n\ | ||
116 | \n\ | 123 | \n\ |
117 | ;; Restore registers from stack \n\ | 124 | ;; Restore registers from stack. \n\ |
118 | movem [$sp+],$r10" | 125 | movem [sp+],r10" |
119 | 126 | ||
120 | /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n) | 127 | /* Outputs. */ |
121 | /* Inputs */ : "0" (dst), "1" (src), "2" (n)); | 128 | : "=r" (dst), "=r" (src), "=r" (n) |
122 | 129 | ||
123 | } | 130 | /* Inputs. */ |
131 | : "0" (dst), "1" (src), "2" (n)); | ||
132 | } | ||
124 | 133 | ||
125 | /* Either we directly starts copying, using dword copying | 134 | while (n >= 16) |
126 | in a loop, or we copy as much as possible with 'movem' | 135 | { |
127 | and then the last block (<44 bytes) is copied here. | 136 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
128 | This will work since 'movem' will have updated src,dst,n. */ | 137 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
138 | *(long *) dst = *(long *) src; dst += 4; src += 4; | ||
139 | *(long *) dst = *(long *) src; dst += 4; src += 4; | ||
129 | 140 | ||
130 | while ( n >= 16 ) | 141 | n -= 16; |
131 | { | 142 | } |
132 | *((long*)dst)++ = *((long*)src)++; | ||
133 | *((long*)dst)++ = *((long*)src)++; | ||
134 | *((long*)dst)++ = *((long*)src)++; | ||
135 | *((long*)dst)++ = *((long*)src)++; | ||
136 | n -= 16; | ||
137 | } | ||
138 | 143 | ||
139 | /* A switch() is definitely the fastest although it takes a LOT of code. | ||
140 | * Particularly if you inline code this. | ||
141 | */ | ||
142 | switch (n) | 144 | switch (n) |
143 | { | 145 | { |
144 | case 0: | 146 | case 0: |
145 | break; | 147 | break; |
148 | |||
146 | case 1: | 149 | case 1: |
147 | *(char*)dst = *(char*)src; | 150 | *dst = *src; |
148 | break; | 151 | break; |
152 | |||
149 | case 2: | 153 | case 2: |
150 | *(short*)dst = *(short*)src; | 154 | *(short *) dst = *(short *) src; |
151 | break; | 155 | break; |
156 | |||
152 | case 3: | 157 | case 3: |
153 | *((short*)dst)++ = *((short*)src)++; | 158 | *(short *) dst = *(short *) src; dst += 2; src += 2; |
154 | *(char*)dst = *(char*)src; | 159 | *dst = *src; |
155 | break; | 160 | break; |
161 | |||
156 | case 4: | 162 | case 4: |
157 | *((long*)dst)++ = *((long*)src)++; | 163 | *(long *) dst = *(long *) src; |
158 | break; | 164 | break; |
165 | |||
159 | case 5: | 166 | case 5: |
160 | *((long*)dst)++ = *((long*)src)++; | 167 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
161 | *(char*)dst = *(char*)src; | 168 | *dst = *src; |
162 | break; | 169 | break; |
170 | |||
163 | case 6: | 171 | case 6: |
164 | *((long*)dst)++ = *((long*)src)++; | 172 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
165 | *(short*)dst = *(short*)src; | 173 | *(short *) dst = *(short *) src; |
166 | break; | 174 | break; |
175 | |||
167 | case 7: | 176 | case 7: |
168 | *((long*)dst)++ = *((long*)src)++; | 177 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
169 | *((short*)dst)++ = *((short*)src)++; | 178 | *(short *) dst = *(short *) src; dst += 2; src += 2; |
170 | *(char*)dst = *(char*)src; | 179 | *dst = *src; |
171 | break; | 180 | break; |
181 | |||
172 | case 8: | 182 | case 8: |
173 | *((long*)dst)++ = *((long*)src)++; | 183 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
174 | *((long*)dst)++ = *((long*)src)++; | 184 | *(long *) dst = *(long *) src; |
175 | break; | 185 | break; |
186 | |||
176 | case 9: | 187 | case 9: |
177 | *((long*)dst)++ = *((long*)src)++; | 188 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
178 | *((long*)dst)++ = *((long*)src)++; | 189 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
179 | *(char*)dst = *(char*)src; | 190 | *dst = *src; |
180 | break; | 191 | break; |
192 | |||
181 | case 10: | 193 | case 10: |
182 | *((long*)dst)++ = *((long*)src)++; | 194 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
183 | *((long*)dst)++ = *((long*)src)++; | 195 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
184 | *(short*)dst = *(short*)src; | 196 | *(short *) dst = *(short *) src; |
185 | break; | 197 | break; |
198 | |||
186 | case 11: | 199 | case 11: |
187 | *((long*)dst)++ = *((long*)src)++; | 200 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
188 | *((long*)dst)++ = *((long*)src)++; | 201 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
189 | *((short*)dst)++ = *((short*)src)++; | 202 | *(short *) dst = *(short *) src; dst += 2; src += 2; |
190 | *(char*)dst = *(char*)src; | 203 | *dst = *src; |
191 | break; | 204 | break; |
205 | |||
192 | case 12: | 206 | case 12: |
193 | *((long*)dst)++ = *((long*)src)++; | 207 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
194 | *((long*)dst)++ = *((long*)src)++; | 208 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
195 | *((long*)dst)++ = *((long*)src)++; | 209 | *(long *) dst = *(long *) src; |
196 | break; | 210 | break; |
211 | |||
197 | case 13: | 212 | case 13: |
198 | *((long*)dst)++ = *((long*)src)++; | 213 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
199 | *((long*)dst)++ = *((long*)src)++; | 214 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
200 | *((long*)dst)++ = *((long*)src)++; | 215 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
201 | *(char*)dst = *(char*)src; | 216 | *dst = *src; |
202 | break; | 217 | break; |
218 | |||
203 | case 14: | 219 | case 14: |
204 | *((long*)dst)++ = *((long*)src)++; | 220 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
205 | *((long*)dst)++ = *((long*)src)++; | 221 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
206 | *((long*)dst)++ = *((long*)src)++; | 222 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
207 | *(short*)dst = *(short*)src; | 223 | *(short *) dst = *(short *) src; |
208 | break; | 224 | break; |
225 | |||
209 | case 15: | 226 | case 15: |
210 | *((long*)dst)++ = *((long*)src)++; | 227 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
211 | *((long*)dst)++ = *((long*)src)++; | 228 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
212 | *((long*)dst)++ = *((long*)src)++; | 229 | *(long *) dst = *(long *) src; dst += 4; src += 4; |
213 | *((short*)dst)++ = *((short*)src)++; | 230 | *(short *) dst = *(short *) src; dst += 2; src += 2; |
214 | *(char*)dst = *(char*)src; | 231 | *dst = *src; |
215 | break; | 232 | break; |
216 | } | 233 | } |
217 | 234 | ||
218 | return return_dst; /* destination pointer. */ | 235 | return return_dst; |
219 | } /* memcpy() */ | 236 | } |