 arch/cris/arch-v10/lib/memset.c | 397
 arch/cris/arch-v32/lib/memset.c | 398
 2 files changed, 404 insertions(+), 391 deletions(-)
diff --git a/arch/cris/arch-v10/lib/memset.c b/arch/cris/arch-v10/lib/memset.c
index 42c1101043a3..c94ea9b3ec29 100644
--- a/arch/cris/arch-v10/lib/memset.c
+++ b/arch/cris/arch-v10/lib/memset.c
@@ -1,252 +1,259 @@
-/*#************************************************************************#*/
-/*#-------------------------------------------------------------------------*/
-/*#                                                                          */
-/*# FUNCTION NAME: memset()                                                  */
-/*#                                                                          */
-/*# PARAMETERS:  void* dst;   Destination address.                           */
-/*#              int c;       Value of byte to write.                        */
-/*#              int len;    Number of bytes to write.                       */
-/*#                                                                          */
-/*# RETURNS:     dst.                                                        */
-/*#                                                                          */
-/*# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard.  */
-/*#              Framework taken from memcpy.  This routine is               */
-/*#              very sensitive to compiler changes in register allocation.  */
-/*#              Should really be rewritten to avoid this problem.           */
-/*#                                                                          */
-/*#-------------------------------------------------------------------------*/
-/*#                                                                          */
-/*# HISTORY                                                                  */
-/*#                                                                          */
-/*# DATE      NAME            CHANGES                                        */
-/*# ----      ----            -------                                        */
-/*# 990713    HP              Tired of watching this function (or            */
-/*#                           really, the nonoptimized generic               */
-/*#                           implementation) take up 90% of simulator       */
-/*#                           output.  Measurements needed.                  */
-/*#                                                                          */
-/*#-------------------------------------------------------------------------*/
-
-#include <linux/types.h>
-
-/* No, there's no macro saying 12*4, since it is "hard" to get it into
-   the asm in a good way.  Thus better to expose the problem everywhere.
-   */
-
-/* Assuming 1 cycle per dword written or read (ok, not really true), and
-   one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1)
-   so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. */
-
-#define ZERO_BLOCK_SIZE (1*12*4)
-
-void *memset(void *pdst,
-             int c,
-             size_t plen)
+/* A memset for CRIS.
+   Copyright (C) 1999-2005 Axis Communications.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Neither the name of Axis Communications nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL AXIS
+   COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+   IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.  */
+
+/* FIXME: This file should really only be used for reference, as the
+   result is somewhat depending on gcc generating what we expect rather
+   than what we describe.  An assembly file should be used instead.  */
+
+/* Note the multiple occurrence of the expression "12*4", including the
+   asm.  It is hard to get it into the asm in a good way.  Thus better to
+   expose the problem everywhere: no macro.  */
+
+/* Assuming one cycle per dword written or read (ok, not really true; the
+   world is not ideal), and one cycle per instruction, then 43+3*(n/48-1)
+   <= 24+24*(n/48-1) so n >= 45.7; n >= 0.9; we win on the first full
+   48-byte block to set.  */
+
+#define MEMSET_BY_BLOCK_THRESHOLD (1 * 48)
+
+/* No name ambiguities in this file.  */
+__asm__ (".syntax no_register_prefix");
+
+void *memset(void *pdst, int c, unsigned int plen)
 {
-  /* Ok.  Now we want the parameters put in special registers.
-     Make sure the compiler is able to make something useful of this. */
+  /* Now we want the parameters in special registers.  Make sure the
+     compiler does something usable with this.  */
 
   register char *return_dst __asm__ ("r10") = pdst;
   register int n __asm__ ("r12") = plen;
   register int lc __asm__ ("r11") = c;
 
-  /* Most apps use memset sanely.  Only those memsetting about 3..4
-     bytes or less get penalized compared to the generic implementation
-     - and that's not really sane use. */
+  /* Most apps use memset sanely.  Memsetting about 3..4 bytes or less get
+     penalized here compared to the generic implementation.  */
 
-  /* Ugh.  This is fragile at best.  Check with newer GCC releases, if
-     they compile cascaded "x |= x << 8" sanely! */
-  __asm__("movu.b %0,$r13\n\t"
-          "lslq 8,$r13\n\t"
-          "move.b %0,$r13\n\t"
-          "move.d $r13,%0\n\t"
-          "lslq 16,$r13\n\t"
-          "or.d $r13,%0"
-          : "=r" (lc) : "0" (lc) : "r13");
+  /* This is fragile performancewise at best.  Check with newer GCC
+     releases, if they compile cascaded "x |= x << 8" to sane code.  */
+  __asm__("movu.b %0,r13 \n\
+	   lslq 8,r13 \n\
+	   move.b %0,r13 \n\
+	   move.d r13,%0 \n\
+	   lslq 16,r13 \n\
+	   or.d r13,%0"
+	  : "=r" (lc)		/* Inputs.  */
+	  : "0" (lc)		/* Outputs.  */
+	  : "r13");		/* Trash.  */
 
   {
     register char *dst __asm__ ("r13") = pdst;
 
-  /* This is NONPORTABLE, but since this whole routine is */
-  /* grossly nonportable that doesn't matter. */
-
-  if (((unsigned long) pdst & 3) != 0
-      /* Oops! n=0 must be a legal call, regardless of alignment. */
-      && n >= 3)
-  {
-    if ((unsigned long)dst & 1)
-    {
-      *dst = (char) lc;
-      n--;
-      dst++;
-    }
-
-    if ((unsigned long)dst & 2)
-    {
-      *(short *)dst = lc;
-      n -= 2;
-      dst += 2;
-    }
-  }
+    if (((unsigned long) pdst & 3) != 0
+	/* Oops! n = 0 must be a valid call, regardless of alignment.  */
+	&& n >= 3)
+      {
+	if ((unsigned long) dst & 1)
+	  {
+	    *dst = (char) lc;
+	    n--;
+	    dst++;
+	  }
+
+	if ((unsigned long) dst & 2)
+	  {
+	    *(short *) dst = lc;
+	    n -= 2;
+	    dst += 2;
+	  }
+      }
 
-  /* Now the fun part.  For the threshold value of this, check the equation
-     above. */
-  /* Decide which copying method to use. */
-  if (n >= ZERO_BLOCK_SIZE)
-  {
-    /* For large copies we use 'movem' */
-
-    /* It is not optimal to tell the compiler about clobbering any
-       registers; that will move the saving/restoring of those registers
-       to the function prologue/epilogue, and make non-movem sizes
-       suboptimal.
-
-       This method is not foolproof; it assumes that the "asm reg"
-       declarations at the beginning of the function really are used
-       here (beware: they may be moved to temporary registers).
-       This way, we do not have to save/move the registers around into
-       temporaries; we can safely use them straight away.
-
-       If you want to check that the allocation was right; then
-       check the equalities in the first comment.  It should say
-       "r13=r13, r12=r12, r11=r11" */
-    __asm__ volatile ("\n\
-	;; Check that the following is true (same register names on \n\
-	;; both sides of equal sign, as in r8=r8): \n\
-	;; %0=r13, %1=r12, %4=r11 \n\
-	;; \n\
-	;; Save the registers we'll clobber in the movem process \n\
-	;; on the stack.  Don't mention them to gcc, it will only be \n\
-	;; upset. \n\
-	subq 11*4,$sp \n\
-	movem $r10,[$sp] \n\
-\n\
-	move.d $r11,$r0 \n\
-	move.d $r11,$r1 \n\
-	move.d $r11,$r2 \n\
-	move.d $r11,$r3 \n\
-	move.d $r11,$r4 \n\
-	move.d $r11,$r5 \n\
-	move.d $r11,$r6 \n\
-	move.d $r11,$r7 \n\
-	move.d $r11,$r8 \n\
-	move.d $r11,$r9 \n\
-	move.d $r11,$r10 \n\
-\n\
-	;; Now we've got this: \n\
-	;; r13 - dst \n\
-	;; r12 - n \n\
-\n\
-	;; Update n for the first loop \n\
-	subq 12*4,$r12 \n\
-0: \n\
-	subq 12*4,$r12 \n\
-	bge 0b \n\
-	movem $r11,[$r13+] \n\
-\n\
-	addq 12*4,$r12 ;; compensate for last loop underflowing n \n\
-\n\
-	;; Restore registers from stack \n\
-	movem [$sp+],$r10"
-
-     /* Outputs */ : "=r" (dst), "=r" (n)
-     /* Inputs */ : "0" (dst), "1" (n), "r" (lc));
-
-  }
+    /* Decide which setting method to use.  */
+    if (n >= MEMSET_BY_BLOCK_THRESHOLD)
+      {
+	/* It is not optimal to tell the compiler about clobbering any
+	   registers; that will move the saving/restoring of those registers
+	   to the function prologue/epilogue, and make non-block sizes
+	   suboptimal.  */
+	__asm__ volatile
+	  ("\
+	   ;; GCC does promise correct register allocations, but let's \n\
+	   ;; make sure it keeps its promises. \n\
+	   .ifnc %0-%1-%4,$r13-$r12-$r11 \n\
+	   .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\" \n\
+	   .endif \n\
+\n\
+	   ;; Save the registers we'll clobber in the movem process \n\
+	   ;; on the stack.  Don't mention them to gcc, it will only be \n\
+	   ;; upset. \n\
+	   subq 11*4,sp \n\
+	   movem r10,[sp] \n\
+\n\
+	   move.d r11,r0 \n\
+	   move.d r11,r1 \n\
+	   move.d r11,r2 \n\
+	   move.d r11,r3 \n\
+	   move.d r11,r4 \n\
+	   move.d r11,r5 \n\
+	   move.d r11,r6 \n\
+	   move.d r11,r7 \n\
+	   move.d r11,r8 \n\
+	   move.d r11,r9 \n\
+	   move.d r11,r10 \n\
+\n\
+	   ;; Now we've got this: \n\
+	   ;; r13 - dst \n\
+	   ;; r12 - n \n\
+\n\
+	   ;; Update n for the first loop \n\
+	   subq 12*4,r12 \n\
+0: \n\
+"
+#ifdef __arch_common_v10_v32
+	   /* Cater to branch offset difference between v32 and v10.  We
+	      assume the branch below has an 8-bit offset.  */
+"	   setf\n"
+#endif
+"	   subq 12*4,r12 \n\
+	   bge 0b \n\
+	   movem r11,[r13+] \n\
+\n\
+	   ;; Compensate for last loop underflowing n. \n\
+	   addq 12*4,r12 \n\
+\n\
+	   ;; Restore registers from stack. \n\
+	   movem [sp+],r10"
+
+	   /* Outputs.  */
+	   : "=r" (dst), "=r" (n)
+
+	   /* Inputs.  */
+	   : "0" (dst), "1" (n), "r" (lc));
+      }
 
-  /* Either we directly starts copying, using dword copying
-     in a loop, or we copy as much as possible with 'movem'
-     and then the last block (<44 bytes) is copied here.
-     This will work since 'movem' will have updated src,dst,n. */
-
-  while ( n >= 16 )
-  {
-    *((long*)dst)++ = lc;
-    *((long*)dst)++ = lc;
-    *((long*)dst)++ = lc;
-    *((long*)dst)++ = lc;
-    n -= 16;
-  }
-
-  /* A switch() is definitely the fastest although it takes a LOT of code.
-   * Particularly if you inline code this.
-   */
+    /* An ad-hoc unroll, used for 4*12-1..16 bytes. */
+    while (n >= 16)
+      {
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	n -= 16;
+      }
+
     switch (n)
       {
       case 0:
 	break;
+
       case 1:
-	*(char*)dst = (char) lc;
+	*dst = (char) lc;
 	break;
+
       case 2:
-	*(short*)dst = (short) lc;
+	*(short *) dst = (short) lc;
 	break;
+
       case 3:
-	*((short*)dst)++ = (short) lc;
-	*(char*)dst = (char) lc;
+	*(short *) dst = (short) lc; dst += 2;
+	*dst = (char) lc;
 	break;
+
       case 4:
-	*((long*)dst)++ = lc;
+	*(long *) dst = lc;
 	break;
+
       case 5:
-	*((long*)dst)++ = lc;
-	*(char*)dst = (char) lc;
+	*(long *) dst = lc; dst += 4;
+	*dst = (char) lc;
 	break;
+
       case 6:
-	*((long*)dst)++ = lc;
-	*(short*)dst = (short) lc;
+	*(long *) dst = lc; dst += 4;
+	*(short *) dst = (short) lc;
 	break;
+
       case 7:
-	*((long*)dst)++ = lc;
-	*((short*)dst)++ = (short) lc;
-	*(char*)dst = (char) lc;
+	*(long *) dst = lc; dst += 4;
+	*(short *) dst = (short) lc; dst += 2;
+	*dst = (char) lc;
 	break;
+
       case 8:
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc;
 	break;
+
       case 9:
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*(char*)dst = (char) lc;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*dst = (char) lc;
 	break;
+
       case 10:
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*(short*)dst = (short) lc;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(short *) dst = (short) lc;
 	break;
+
       case 11:
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*((short*)dst)++ = (short) lc;
-	*(char*)dst = (char) lc;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(short *) dst = (short) lc; dst += 2;
+	*dst = (char) lc;
 	break;
+
       case 12:
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc;
 	break;
+
       case 13:
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*(char*)dst = (char) lc;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*dst = (char) lc;
 	break;
+
       case 14:
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*(short*)dst = (short) lc;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(short *) dst = (short) lc;
 	break;
+
       case 15:
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*((short*)dst)++ = (short) lc;
-	*(char*)dst = (char) lc;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(short *) dst = (short) lc; dst += 2;
+	*dst = (char) lc;
 	break;
       }
   }
 
-  return return_dst; /* destination pointer. */
-} /* memset() */
+  return return_dst;
+}
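
For reference, the fill-value broadcast asm in both the old and new versions computes the same thing as the "cascaded x |= x << 8" its comment mentions: the fill byte copied into all four bytes of a 32-bit word. A minimal portable C sketch of that step (illustration only, not part of the patch):

#include <stdint.h>

/* Broadcast the low byte of c into all four bytes of a 32-bit word,
   e.g. c = 0xab gives 0xabababab.  This is the value the
   movu.b/lslq/move.b/or.d sequence leaves in lc. */
static uint32_t byte_broadcast(int c)
{
	uint32_t x = (uint8_t) c;	/* keep only the low byte */
	x |= x << 8;			/* 0x000000ab -> 0x0000abab */
	x |= x << 16;			/* 0x0000abab -> 0xabababab */
	return x;
}
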
diff --git a/arch/cris/arch-v32/lib/memset.c b/arch/cris/arch-v32/lib/memset.c
index ffca1214674e..c94ea9b3ec29 100644
--- a/arch/cris/arch-v32/lib/memset.c
+++ b/arch/cris/arch-v32/lib/memset.c
@@ -1,253 +1,259 @@
-/*#************************************************************************#*/
-/*#-------------------------------------------------------------------------*/
-/*#                                                                          */
-/*# FUNCTION NAME: memset()                                                  */
-/*#                                                                          */
-/*# PARAMETERS:  void* dst;   Destination address.                           */
-/*#              int c;       Value of byte to write.                        */
-/*#              int len;    Number of bytes to write.                       */
-/*#                                                                          */
-/*# RETURNS:     dst.                                                        */
-/*#                                                                          */
-/*# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard.  */
-/*#              Framework taken from memcpy.  This routine is               */
-/*#              very sensitive to compiler changes in register allocation.  */
-/*#              Should really be rewritten to avoid this problem.           */
-/*#                                                                          */
-/*#-------------------------------------------------------------------------*/
-/*#                                                                          */
-/*# HISTORY                                                                  */
-/*#                                                                          */
-/*# DATE      NAME            CHANGES                                        */
-/*# ----      ----            -------                                        */
-/*# 990713    HP              Tired of watching this function (or            */
-/*#                           really, the nonoptimized generic               */
-/*#                           implementation) take up 90% of simulator       */
-/*#                           output.  Measurements needed.                  */
-/*#                                                                          */
-/*#-------------------------------------------------------------------------*/
-
-#include <linux/types.h>
-
-/* No, there's no macro saying 12*4, since it is "hard" to get it into
-   the asm in a good way.  Thus better to expose the problem everywhere.
-   */
-
-/* Assuming 1 cycle per dword written or read (ok, not really true), and
-   one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1)
-   so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. */
-
-#define ZERO_BLOCK_SIZE (1*12*4)
-
-void *memset(void *pdst,
-             int c,
-             size_t plen)
+/* A memset for CRIS.
+   Copyright (C) 1999-2005 Axis Communications.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Neither the name of Axis Communications nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL AXIS
+   COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+   IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.  */
+
+/* FIXME: This file should really only be used for reference, as the
+   result is somewhat depending on gcc generating what we expect rather
+   than what we describe.  An assembly file should be used instead.  */
+
+/* Note the multiple occurrence of the expression "12*4", including the
+   asm.  It is hard to get it into the asm in a good way.  Thus better to
+   expose the problem everywhere: no macro.  */
+
+/* Assuming one cycle per dword written or read (ok, not really true; the
+   world is not ideal), and one cycle per instruction, then 43+3*(n/48-1)
+   <= 24+24*(n/48-1) so n >= 45.7; n >= 0.9; we win on the first full
+   48-byte block to set.  */
+
+#define MEMSET_BY_BLOCK_THRESHOLD (1 * 48)
+
+/* No name ambiguities in this file.  */
+__asm__ (".syntax no_register_prefix");
+
+void *memset(void *pdst, int c, unsigned int plen)
 {
-  /* Ok.  Now we want the parameters put in special registers.
-     Make sure the compiler is able to make something useful of this. */
+  /* Now we want the parameters in special registers.  Make sure the
+     compiler does something usable with this.  */
 
   register char *return_dst __asm__ ("r10") = pdst;
   register int n __asm__ ("r12") = plen;
   register int lc __asm__ ("r11") = c;
 
-  /* Most apps use memset sanely.  Only those memsetting about 3..4
-     bytes or less get penalized compared to the generic implementation
-     - and that's not really sane use. */
+  /* Most apps use memset sanely.  Memsetting about 3..4 bytes or less get
+     penalized here compared to the generic implementation.  */
 
-  /* Ugh.  This is fragile at best.  Check with newer GCC releases, if
-     they compile cascaded "x |= x << 8" sanely! */
-  __asm__("movu.b %0,$r13 \n\
-	   lslq 8,$r13 \n\
-	   move.b %0,$r13 \n\
-	   move.d $r13,%0 \n\
-	   lslq 16,$r13 \n\
-	   or.d $r13,%0"
-	  : "=r" (lc) : "0" (lc) : "r13");
+  /* This is fragile performancewise at best.  Check with newer GCC
+     releases, if they compile cascaded "x |= x << 8" to sane code.  */
+  __asm__("movu.b %0,r13 \n\
+	   lslq 8,r13 \n\
+	   move.b %0,r13 \n\
+	   move.d r13,%0 \n\
+	   lslq 16,r13 \n\
+	   or.d r13,%0"
+	  : "=r" (lc)		/* Inputs.  */
+	  : "0" (lc)		/* Outputs.  */
+	  : "r13");		/* Trash.  */
 
   {
     register char *dst __asm__ ("r13") = pdst;
 
-  /* This is NONPORTABLE, but since this whole routine is */
-  /* grossly nonportable that doesn't matter. */
-
-  if (((unsigned long) pdst & 3) != 0
-      /* Oops! n=0 must be a legal call, regardless of alignment. */
-      && n >= 3)
-  {
-    if ((unsigned long)dst & 1)
-    {
-      *dst = (char) lc;
-      n--;
-      dst++;
-    }
-
-    if ((unsigned long)dst & 2)
-    {
-      *(short *)dst = lc;
-      n -= 2;
-      dst += 2;
-    }
-  }
+    if (((unsigned long) pdst & 3) != 0
+	/* Oops! n = 0 must be a valid call, regardless of alignment.  */
+	&& n >= 3)
+      {
+	if ((unsigned long) dst & 1)
+	  {
+	    *dst = (char) lc;
+	    n--;
+	    dst++;
+	  }
+
+	if ((unsigned long) dst & 2)
+	  {
+	    *(short *) dst = lc;
+	    n -= 2;
+	    dst += 2;
+	  }
+      }
 
-  /* Now the fun part.  For the threshold value of this, check the equation
-     above. */
-  /* Decide which copying method to use. */
-  if (n >= ZERO_BLOCK_SIZE)
-  {
-    /* For large copies we use 'movem' */
-
-    /* It is not optimal to tell the compiler about clobbering any
-       registers; that will move the saving/restoring of those registers
-       to the function prologue/epilogue, and make non-movem sizes
-       suboptimal.
-
-       This method is not foolproof; it assumes that the "asm reg"
-       declarations at the beginning of the function really are used
-       here (beware: they may be moved to temporary registers).
-       This way, we do not have to save/move the registers around into
-       temporaries; we can safely use them straight away.
-
-       If you want to check that the allocation was right; then
-       check the equalities in the first comment.  It should say
-       "r13=r13, r12=r12, r11=r11" */
-    __asm__ volatile (" \n\
-	;; Check that the register asm declaration got right. \n\
-	;; The GCC manual says it will work, but there *has* been bugs. \n\
-	.ifnc %0-%1-%4,$r13-$r12-$r11 \n\
-	.err \n\
-	.endif \n\
-\n\
-	;; Save the registers we'll clobber in the movem process \n\
-	;; on the stack.  Don't mention them to gcc, it will only be \n\
-	;; upset. \n\
-	subq 11*4,$sp \n\
-	movem $r10,[$sp] \n\
-\n\
-	move.d $r11,$r0 \n\
-	move.d $r11,$r1 \n\
-	move.d $r11,$r2 \n\
-	move.d $r11,$r3 \n\
-	move.d $r11,$r4 \n\
-	move.d $r11,$r5 \n\
-	move.d $r11,$r6 \n\
-	move.d $r11,$r7 \n\
-	move.d $r11,$r8 \n\
-	move.d $r11,$r9 \n\
-	move.d $r11,$r10 \n\
-\n\
-	;; Now we've got this: \n\
-	;; r13 - dst \n\
-	;; r12 - n \n\
-\n\
-	;; Update n for the first loop \n\
-	subq 12*4,$r12 \n\
-0: \n\
-	subq 12*4,$r12 \n\
-	bge 0b \n\
-	movem $r11,[$r13+] \n\
-\n\
-	addq 12*4,$r12 ;; compensate for last loop underflowing n \n\
-\n\
-	;; Restore registers from stack \n\
-	movem [$sp+],$r10"
-
-     /* Outputs */ : "=r" (dst), "=r" (n)
-     /* Inputs */ : "0" (dst), "1" (n), "r" (lc));
-  }
+    /* Decide which setting method to use.  */
+    if (n >= MEMSET_BY_BLOCK_THRESHOLD)
+      {
+	/* It is not optimal to tell the compiler about clobbering any
+	   registers; that will move the saving/restoring of those registers
+	   to the function prologue/epilogue, and make non-block sizes
+	   suboptimal.  */
+	__asm__ volatile
+	  ("\
+	   ;; GCC does promise correct register allocations, but let's \n\
+	   ;; make sure it keeps its promises. \n\
+	   .ifnc %0-%1-%4,$r13-$r12-$r11 \n\
+	   .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\" \n\
+	   .endif \n\
+\n\
+	   ;; Save the registers we'll clobber in the movem process \n\
+	   ;; on the stack.  Don't mention them to gcc, it will only be \n\
+	   ;; upset. \n\
+	   subq 11*4,sp \n\
+	   movem r10,[sp] \n\
+\n\
+	   move.d r11,r0 \n\
+	   move.d r11,r1 \n\
+	   move.d r11,r2 \n\
+	   move.d r11,r3 \n\
+	   move.d r11,r4 \n\
+	   move.d r11,r5 \n\
+	   move.d r11,r6 \n\
+	   move.d r11,r7 \n\
+	   move.d r11,r8 \n\
+	   move.d r11,r9 \n\
+	   move.d r11,r10 \n\
+\n\
+	   ;; Now we've got this: \n\
+	   ;; r13 - dst \n\
+	   ;; r12 - n \n\
+\n\
+	   ;; Update n for the first loop \n\
+	   subq 12*4,r12 \n\
+0: \n\
+"
+#ifdef __arch_common_v10_v32
+	   /* Cater to branch offset difference between v32 and v10.  We
+	      assume the branch below has an 8-bit offset.  */
+"	   setf\n"
+#endif
+"	   subq 12*4,r12 \n\
+	   bge 0b \n\
+	   movem r11,[r13+] \n\
+\n\
+	   ;; Compensate for last loop underflowing n. \n\
+	   addq 12*4,r12 \n\
+\n\
+	   ;; Restore registers from stack. \n\
+	   movem [sp+],r10"
+
+	   /* Outputs.  */
+	   : "=r" (dst), "=r" (n)
+
+	   /* Inputs.  */
+	   : "0" (dst), "1" (n), "r" (lc));
+      }
 
-  /* Either we directly starts copying, using dword copying
-     in a loop, or we copy as much as possible with 'movem'
-     and then the last block (<44 bytes) is copied here.
-     This will work since 'movem' will have updated src,dst,n. */
-
-  while ( n >= 16 )
-  {
-    *((long*)dst)++ = lc;
-    *((long*)dst)++ = lc;
-    *((long*)dst)++ = lc;
-    *((long*)dst)++ = lc;
-    n -= 16;
-  }
-
-  /* A switch() is definitely the fastest although it takes a LOT of code.
-   * Particularly if you inline code this.
-   */
+    /* An ad-hoc unroll, used for 4*12-1..16 bytes. */
+    while (n >= 16)
+      {
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	n -= 16;
+      }
+
     switch (n)
       {
       case 0:
 	break;
+
       case 1:
-	*(char*)dst = (char) lc;
+	*dst = (char) lc;
 	break;
+
       case 2:
-	*(short*)dst = (short) lc;
+	*(short *) dst = (short) lc;
 	break;
+
       case 3:
-	*((short*)dst)++ = (short) lc;
-	*(char*)dst = (char) lc;
+	*(short *) dst = (short) lc; dst += 2;
+	*dst = (char) lc;
 	break;
+
       case 4:
-	*((long*)dst)++ = lc;
+	*(long *) dst = lc;
 	break;
+
       case 5:
-	*((long*)dst)++ = lc;
-	*(char*)dst = (char) lc;
+	*(long *) dst = lc; dst += 4;
+	*dst = (char) lc;
 	break;
+
       case 6:
-	*((long*)dst)++ = lc;
-	*(short*)dst = (short) lc;
+	*(long *) dst = lc; dst += 4;
+	*(short *) dst = (short) lc;
 	break;
+
       case 7:
-	*((long*)dst)++ = lc;
-	*((short*)dst)++ = (short) lc;
-	*(char*)dst = (char) lc;
+	*(long *) dst = lc; dst += 4;
+	*(short *) dst = (short) lc; dst += 2;
+	*dst = (char) lc;
 	break;
+
       case 8:
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc;
 	break;
+
       case 9:
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*(char*)dst = (char) lc;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*dst = (char) lc;
 	break;
+
       case 10:
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*(short*)dst = (short) lc;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(short *) dst = (short) lc;
 	break;
+
       case 11:
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*((short*)dst)++ = (short) lc;
-	*(char*)dst = (char) lc;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(short *) dst = (short) lc; dst += 2;
+	*dst = (char) lc;
 	break;
+
       case 12:
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc;
 	break;
+
       case 13:
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*(char*)dst = (char) lc;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*dst = (char) lc;
 	break;
+
       case 14:
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*(short*)dst = (short) lc;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(short *) dst = (short) lc;
 	break;
+
       case 15:
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*((long*)dst)++ = lc;
-	*((short*)dst)++ = (short) lc;
-	*(char*)dst = (char) lc;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(long *) dst = lc; dst += 4;
+	*(short *) dst = (short) lc; dst += 2;
+	*dst = (char) lc;
 	break;
       }
   }
 
-  return return_dst; /* destination pointer. */
-} /* memset() */
+  return return_dst;
+}
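
After this change both files share the same overall shape: broadcast the byte, align the head to 4 bytes, fill 48-byte blocks with a twelve-register movem (per the comment's estimate 43+3*(n/48-1) <= 24+24*(n/48-1), the block path wins from the first full block), run a 16-byte unrolled loop, then handle the 0..15-byte tail. A compact C sketch of that control flow, for orientation only: the name ref_memset and the plain loops are assumptions for the sketch, not the committed code, which keeps the name memset, pins values in fixed registers, and does the block fill in inline assembly.

#include <stddef.h>
#include <stdint.h>

static void *ref_memset(void *pdst, int c, size_t n)
{
	char *dst = pdst;
	uint32_t lc = (uint8_t) c;

	lc |= lc << 8;		/* broadcast the fill byte ... */
	lc |= lc << 16;		/* ... into all four lanes */

	/* Head: reach 4-byte alignment; n == 0 must remain a valid call. */
	if (((uintptr_t) dst & 3) != 0 && n >= 3) {
		if ((uintptr_t) dst & 1) {
			*dst++ = (char) lc;
			n--;
		}
		if ((uintptr_t) dst & 2) {
			*(uint16_t *) dst = (uint16_t) lc;
			dst += 2;
			n -= 2;
		}
	}

	/* Body: 48-byte blocks (the movem part of the real code). */
	while (n >= 48) {
		int i;
		for (i = 0; i < 12; i++) {
			*(uint32_t *) dst = lc;
			dst += 4;
		}
		n -= 48;
	}

	/* Ad-hoc unroll for what remains above the tail. */
	while (n >= 16) {
		*(uint32_t *) dst = lc; dst += 4;
		*(uint32_t *) dst = lc; dst += 4;
		*(uint32_t *) dst = lc; dst += 4;
		*(uint32_t *) dst = lc; dst += 4;
		n -= 16;
	}

	/* Tail: 0..15 bytes; the real code switches on n instead.  Note
	   that, like the original, this may issue an unaligned 2-byte
	   store when n < 3 on a misaligned dst; CRIS tolerates that, but
	   a strict-alignment machine would need byte stores here. */
	while (n >= 4) { *(uint32_t *) dst = lc; dst += 4; n -= 4; }
	if (n & 2) { *(uint16_t *) dst = (uint16_t) lc; dst += 2; }
	if (n & 1) { *dst = (char) lc; }

	return pdst;
}
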