aboutsummaryrefslogtreecommitdiffstats
path: root/arch/cris/arch-v32
diff options
context:
space:
mode:
Diffstat (limited to 'arch/cris/arch-v32')
-rw-r--r--arch/cris/arch-v32/lib/string.c325
1 files changed, 171 insertions, 154 deletions
diff --git a/arch/cris/arch-v32/lib/string.c b/arch/cris/arch-v32/lib/string.c
index 6740b2cebae5..c7bd6ebdc93c 100644
--- a/arch/cris/arch-v32/lib/string.c
+++ b/arch/cris/arch-v32/lib/string.c
@@ -1,55 +1,59 @@
1/*#************************************************************************#*/ 1/* A memcpy for CRIS.
2/*#-------------------------------------------------------------------------*/ 2 Copyright (C) 1994-2005 Axis Communications.
3/*# */ 3 All rights reserved.
4/*# FUNCTION NAME: memcpy() */ 4
5/*# */ 5 Redistribution and use in source and binary forms, with or without
6/*# PARAMETERS: void* dst; Destination address. */ 6 modification, are permitted provided that the following conditions
7/*# void* src; Source address. */ 7 are met:
8/*# int len; Number of bytes to copy. */ 8
9/*# */ 9 1. Redistributions of source code must retain the above copyright
10/*# RETURNS: dst. */ 10 notice, this list of conditions and the following disclaimer.
11/*# */ 11
12/*# DESCRIPTION: Copies len bytes of memory from src to dst. No guarantees */ 12 2. Neither the name of Axis Communications nor the names of its
13/*# about copying of overlapping memory areas. This routine is */ 13 contributors may be used to endorse or promote products derived
14/*# very sensitive to compiler changes in register allocation. */ 14 from this software without specific prior written permission.
15/*# Should really be rewritten to avoid this problem. */ 15
16/*# */ 16 THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS
17/*#-------------------------------------------------------------------------*/ 17 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18/*# */ 18 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19/*# HISTORY */ 19 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS
20/*# */ 20 COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
21/*# DATE NAME CHANGES */ 21 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22/*# ---- ---- ------- */ 22 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23/*# 941007 Kenny R Creation */ 23 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24/*# 941011 Kenny R Lots of optimizations and inlining. */ 24 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
25/*# 941129 Ulf A Adapted for use in libc. */ 25 STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
26/*# 950216 HP N==0 forgotten if non-aligned src/dst. */ 26 IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27/*# Added some optimizations. */ 27 POSSIBILITY OF SUCH DAMAGE. */
28/*# 001025 HP Make src and dst char *. Align dst to */ 28
29/*# dword, not just word-if-both-src-and-dst- */ 29/* FIXME: This file should really only be used for reference, as the
30/*# are-misaligned. */ 30 result is somewhat depending on gcc generating what we expect rather
31/*# */ 31 than what we describe. An assembly file should be used instead. */
32/*#-------------------------------------------------------------------------*/ 32
33 33#include <stddef.h>
34#include <linux/types.h> 34
35 35/* Break even between movem and move16 is really at 38.7 * 2, but
36void *memcpy(void *pdst, 36 modulo 44, so up to the next multiple of 44, we use ordinary code. */
37 const void *psrc, 37#define MEMCPY_BY_BLOCK_THRESHOLD (44 * 2)
38 size_t pn) 38
39/* No name ambiguities in this file. */
40__asm__ (".syntax no_register_prefix");
41
42void *
43memcpy(void *pdst, const void *psrc, size_t pn)
39{ 44{
40 /* Ok. Now we want the parameters put in special registers. 45 /* Now we want the parameters put in special registers.
41 Make sure the compiler is able to make something useful of this. 46 Make sure the compiler is able to make something useful of this.
42 As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). 47 As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
43 48
44 If gcc was alright, it really would need no temporaries, and no 49 If gcc was alright, it really would need no temporaries, and no
45 stack space to save stuff on. */ 50 stack space to save stuff on. */
46 51
47 register void *return_dst __asm__ ("r10") = pdst; 52 register void *return_dst __asm__ ("r10") = pdst;
48 register char *dst __asm__ ("r13") = pdst; 53 register unsigned char *dst __asm__ ("r13") = pdst;
49 register const char *src __asm__ ("r11") = psrc; 54 register unsigned const char *src __asm__ ("r11") = psrc;
50 register int n __asm__ ("r12") = pn; 55 register int n __asm__ ("r12") = pn;
51 56
52
53 /* When src is aligned but not dst, this makes a few extra needless 57 /* When src is aligned but not dst, this makes a few extra needless
54 cycles. I believe it would take as many to check that the 58 cycles. I believe it would take as many to check that the
55 re-alignment was unnecessary. */ 59 re-alignment was unnecessary. */
@@ -59,161 +63,174 @@ void *memcpy(void *pdst,
59 && n >= 3) 63 && n >= 3)
60 { 64 {
61 if ((unsigned long) dst & 1) 65 if ((unsigned long) dst & 1)
62 { 66 {
63 n--; 67 n--;
64 *(char*)dst = *(char*)src; 68 *dst = *src;
65 src++; 69 src++;
66 dst++; 70 dst++;
67 } 71 }
68 72
69 if ((unsigned long) dst & 2) 73 if ((unsigned long) dst & 2)
70 { 74 {
71 n -= 2; 75 n -= 2;
72 *(short*)dst = *(short*)src; 76 *(short *) dst = *(short *) src;
73 src += 2; 77 src += 2;
74 dst += 2; 78 dst += 2;
75 } 79 }
76 } 80 }
77 81
78 /* Decide which copying method to use. Movem is dirt cheap, so the 82 /* Decide which copying method to use. */
79 overhead is low enough to always use the minimum block size as the 83 if (n >= MEMCPY_BY_BLOCK_THRESHOLD)
80 threshold. */ 84 {
81 if (n >= 44) 85 /* It is not optimal to tell the compiler about clobbering any
82 { 86 registers; that will move the saving/restoring of those registers
83 /* For large copies we use 'movem' */ 87 to the function prologue/epilogue, and make non-movem sizes
84 88 suboptimal. */
85 /* It is not optimal to tell the compiler about clobbering any 89 __asm__ volatile
86 registers; that will move the saving/restoring of those registers 90 ("\
87 to the function prologue/epilogue, and make non-movem sizes 91 ;; GCC does promise correct register allocations, but let's \n\
88 suboptimal. */ 92 ;; make sure it keeps its promises. \n\
89 __asm__ volatile (" \n\ 93 .ifnc %0-%1-%2,$r13-$r11-$r12 \n\
90 ;; Check that the register asm declaration got right. \n\ 94 .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\" \n\
91 ;; The GCC manual explicitly says TRT will happen. \n\ 95 .endif \n\
92 .ifnc %0-%1-%2,$r13-$r11-$r12 \n\
93 .err \n\
94 .endif \n\
95 \n\
96 ;; Save the registers we'll use in the movem process \n\
97 \n\ 96 \n\
98 ;; on the stack. \n\ 97 ;; Save the registers we'll use in the movem process \n\
99 subq 11*4,$sp \n\ 98 ;; on the stack. \n\
100 movem $r10,[$sp] \n\ 99 subq 11*4,sp \n\
100 movem r10,[sp] \n\
101 \n\ 101 \n\
102 ;; Now we've got this: \n\ 102 ;; Now we've got this: \n\
103 ;; r11 - src \n\ 103 ;; r11 - src \n\
104 ;; r13 - dst \n\ 104 ;; r13 - dst \n\
105 ;; r12 - n \n\ 105 ;; r12 - n \n\
106 \n\ 106 \n\
107 ;; Update n for the first loop \n\ 107 ;; Update n for the first loop. \n\
108 subq 44,$r12 \n\ 108 subq 44,r12 \n\
1090: \n\ 1090: \n\
110 movem [$r11+],$r10 \n\ 110"
111 subq 44,$r12 \n\ 111#ifdef __arch_common_v10_v32
112 bge 0b \n\ 112 /* Cater to branch offset difference between v32 and v10. We
113 movem $r10,[$r13+] \n\ 113 assume the branch below has an 8-bit offset. */
114" setf\n"
115#endif
116" movem [r11+],r10 \n\
117 subq 44,r12 \n\
118 bge 0b \n\
119 movem r10,[r13+] \n\
114 \n\ 120 \n\
115 addq 44,$r12 ;; compensate for last loop underflowing n \n\ 121 ;; Compensate for last loop underflowing n. \n\
122 addq 44,r12 \n\
116 \n\ 123 \n\
117 ;; Restore registers from stack \n\ 124 ;; Restore registers from stack. \n\
118 movem [$sp+],$r10" 125 movem [sp+],r10"
119 126
120 /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n) 127 /* Outputs. */
121 /* Inputs */ : "0" (dst), "1" (src), "2" (n)); 128 : "=r" (dst), "=r" (src), "=r" (n)
122 129
123 } 130 /* Inputs. */
131 : "0" (dst), "1" (src), "2" (n));
132 }
124 133
125 /* Either we directly start copying, using dword copying 134 while (n >= 16)
126 in a loop, or we copy as much as possible with 'movem' 135 {
127 and then the last block (<44 bytes) is copied here. 136 *(long *) dst = *(long *) src; dst += 4; src += 4;
128 This will work since 'movem' will have updated src,dst,n. */ 137 *(long *) dst = *(long *) src; dst += 4; src += 4;
138 *(long *) dst = *(long *) src; dst += 4; src += 4;
139 *(long *) dst = *(long *) src; dst += 4; src += 4;
129 140
130 while ( n >= 16 ) 141 n -= 16;
131 { 142 }
132 *((long*)dst)++ = *((long*)src)++;
133 *((long*)dst)++ = *((long*)src)++;
134 *((long*)dst)++ = *((long*)src)++;
135 *((long*)dst)++ = *((long*)src)++;
136 n -= 16;
137 }
138 143
139 /* A switch() is definitely the fastest although it takes a LOT of code.
140 * Particularly if you inline code this.
141 */
142 switch (n) 144 switch (n)
143 { 145 {
144 case 0: 146 case 0:
145 break; 147 break;
148
146 case 1: 149 case 1:
147 *(char*)dst = *(char*)src; 150 *dst = *src;
148 break; 151 break;
152
149 case 2: 153 case 2:
150 *(short*)dst = *(short*)src; 154 *(short *) dst = *(short *) src;
151 break; 155 break;
156
152 case 3: 157 case 3:
153 *((short*)dst)++ = *((short*)src)++; 158 *(short *) dst = *(short *) src; dst += 2; src += 2;
154 *(char*)dst = *(char*)src; 159 *dst = *src;
155 break; 160 break;
161
156 case 4: 162 case 4:
157 *((long*)dst)++ = *((long*)src)++; 163 *(long *) dst = *(long *) src;
158 break; 164 break;
165
159 case 5: 166 case 5:
160 *((long*)dst)++ = *((long*)src)++; 167 *(long *) dst = *(long *) src; dst += 4; src += 4;
161 *(char*)dst = *(char*)src; 168 *dst = *src;
162 break; 169 break;
170
163 case 6: 171 case 6:
164 *((long*)dst)++ = *((long*)src)++; 172 *(long *) dst = *(long *) src; dst += 4; src += 4;
165 *(short*)dst = *(short*)src; 173 *(short *) dst = *(short *) src;
166 break; 174 break;
175
167 case 7: 176 case 7:
168 *((long*)dst)++ = *((long*)src)++; 177 *(long *) dst = *(long *) src; dst += 4; src += 4;
169 *((short*)dst)++ = *((short*)src)++; 178 *(short *) dst = *(short *) src; dst += 2; src += 2;
170 *(char*)dst = *(char*)src; 179 *dst = *src;
171 break; 180 break;
181
172 case 8: 182 case 8:
173 *((long*)dst)++ = *((long*)src)++; 183 *(long *) dst = *(long *) src; dst += 4; src += 4;
174 *((long*)dst)++ = *((long*)src)++; 184 *(long *) dst = *(long *) src;
175 break; 185 break;
186
176 case 9: 187 case 9:
177 *((long*)dst)++ = *((long*)src)++; 188 *(long *) dst = *(long *) src; dst += 4; src += 4;
178 *((long*)dst)++ = *((long*)src)++; 189 *(long *) dst = *(long *) src; dst += 4; src += 4;
179 *(char*)dst = *(char*)src; 190 *dst = *src;
180 break; 191 break;
192
181 case 10: 193 case 10:
182 *((long*)dst)++ = *((long*)src)++; 194 *(long *) dst = *(long *) src; dst += 4; src += 4;
183 *((long*)dst)++ = *((long*)src)++; 195 *(long *) dst = *(long *) src; dst += 4; src += 4;
184 *(short*)dst = *(short*)src; 196 *(short *) dst = *(short *) src;
185 break; 197 break;
198
186 case 11: 199 case 11:
187 *((long*)dst)++ = *((long*)src)++; 200 *(long *) dst = *(long *) src; dst += 4; src += 4;
188 *((long*)dst)++ = *((long*)src)++; 201 *(long *) dst = *(long *) src; dst += 4; src += 4;
189 *((short*)dst)++ = *((short*)src)++; 202 *(short *) dst = *(short *) src; dst += 2; src += 2;
190 *(char*)dst = *(char*)src; 203 *dst = *src;
191 break; 204 break;
205
192 case 12: 206 case 12:
193 *((long*)dst)++ = *((long*)src)++; 207 *(long *) dst = *(long *) src; dst += 4; src += 4;
194 *((long*)dst)++ = *((long*)src)++; 208 *(long *) dst = *(long *) src; dst += 4; src += 4;
195 *((long*)dst)++ = *((long*)src)++; 209 *(long *) dst = *(long *) src;
196 break; 210 break;
211
197 case 13: 212 case 13:
198 *((long*)dst)++ = *((long*)src)++; 213 *(long *) dst = *(long *) src; dst += 4; src += 4;
199 *((long*)dst)++ = *((long*)src)++; 214 *(long *) dst = *(long *) src; dst += 4; src += 4;
200 *((long*)dst)++ = *((long*)src)++; 215 *(long *) dst = *(long *) src; dst += 4; src += 4;
201 *(char*)dst = *(char*)src; 216 *dst = *src;
202 break; 217 break;
218
203 case 14: 219 case 14:
204 *((long*)dst)++ = *((long*)src)++; 220 *(long *) dst = *(long *) src; dst += 4; src += 4;
205 *((long*)dst)++ = *((long*)src)++; 221 *(long *) dst = *(long *) src; dst += 4; src += 4;
206 *((long*)dst)++ = *((long*)src)++; 222 *(long *) dst = *(long *) src; dst += 4; src += 4;
207 *(short*)dst = *(short*)src; 223 *(short *) dst = *(short *) src;
208 break; 224 break;
225
209 case 15: 226 case 15:
210 *((long*)dst)++ = *((long*)src)++; 227 *(long *) dst = *(long *) src; dst += 4; src += 4;
211 *((long*)dst)++ = *((long*)src)++; 228 *(long *) dst = *(long *) src; dst += 4; src += 4;
212 *((long*)dst)++ = *((long*)src)++; 229 *(long *) dst = *(long *) src; dst += 4; src += 4;
213 *((short*)dst)++ = *((short*)src)++; 230 *(short *) dst = *(short *) src; dst += 2; src += 2;
214 *(char*)dst = *(char*)src; 231 *dst = *src;
215 break; 232 break;
216 } 233 }
217 234
218 return return_dst; /* destination pointer. */ 235 return return_dst;
219} /* memcpy() */ 236}