diff options
Diffstat (limited to 'arch/cris/arch-v32/lib/memset.c')
-rw-r--r-- | arch/cris/arch-v32/lib/memset.c | 253 |
1 files changed, 253 insertions, 0 deletions
diff --git a/arch/cris/arch-v32/lib/memset.c b/arch/cris/arch-v32/lib/memset.c new file mode 100644 index 000000000000..ffca1214674e --- /dev/null +++ b/arch/cris/arch-v32/lib/memset.c | |||
@@ -0,0 +1,253 @@ | |||
1 | /*#************************************************************************#*/ | ||
2 | /*#-------------------------------------------------------------------------*/ | ||
3 | /*# */ | ||
4 | /*# FUNCTION NAME: memset() */ | ||
5 | /*# */ | ||
6 | /*# PARAMETERS: void* dst; Destination address. */ | ||
7 | /*# int c; Value of byte to write. */ | ||
8 | /*# size_t len; Number of bytes to write. */ | ||
9 | /*# */ | ||
10 | /*# RETURNS: dst. */ | ||
11 | /*# */ | ||
12 | /*# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard. */ | ||
13 | /*# Framework taken from memcpy. This routine is */ | ||
14 | /*# very sensitive to compiler changes in register allocation. */ | ||
15 | /*# Should really be rewritten to avoid this problem. */ | ||
16 | /*# */ | ||
17 | /*#-------------------------------------------------------------------------*/ | ||
18 | /*# */ | ||
19 | /*# HISTORY */ | ||
20 | /*# */ | ||
21 | /*# DATE NAME CHANGES */ | ||
22 | /*# ---- ---- ------- */ | ||
23 | /*# 990713 HP Tired of watching this function (or */ | ||
24 | /*# really, the nonoptimized generic */ | ||
25 | /*# implementation) take up 90% of simulator */ | ||
26 | /*# output. Measurements needed. */ | ||
27 | /*# */ | ||
28 | /*#-------------------------------------------------------------------------*/ | ||
29 | |||
30 | #include <linux/types.h> | ||
31 | |||
32 | /* No, there's no macro saying 12*4, since it is "hard" to get it into | ||
33 | the asm in a good way. Thus better to expose the problem everywhere. | ||
34 | */ | ||
35 | |||
36 | /* Assuming 1 cycle per dword written or read (ok, not really true), and | ||
37 | one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1) | ||
38 | so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. ZERO_BLOCK_SIZE is that block size in bytes (12 dwords of 4 bytes, i.e. one movem store of r0..r11) and is the threshold memset() compares n against before taking the movem path. */ | ||
39 | |||
40 | #define ZERO_BLOCK_SIZE (1*12*4) | ||
41 | |||
42 | void *memset(void *pdst, | ||
43 | int c, | ||
44 | size_t plen) | ||
45 | { | ||
46 | /* Fills plen bytes at pdst with the low byte of c and returns pdst. Ok. Now we want the parameters put in special registers (r10, r12, r11), because the movem asm further down refers to r11, r12 and r13 by name. | ||
47 | Make sure the compiler is able to make something useful of this. */ | ||
48 | |||
49 | register char *return_dst __asm__ ("r10") = pdst; | ||
50 | register int n __asm__ ("r12") = plen; | ||
51 | register int lc __asm__ ("r11") = c; | ||
52 | |||
53 | /* Most apps use memset sanely. Only those memsetting about 3..4 | ||
54 | bytes or less get penalized compared to the generic implementation | ||
55 | - and that's not really sane use. */ | ||
56 | |||
57 | /* Ugh. This is fragile at best. Check with newer GCC releases, if | ||
58 | they compile cascaded "x |= x << 8" sanely! The asm below replicates the low byte of lc into all four bytes of lc, using r13 as scratch (declared as clobbered). */ | ||
59 | __asm__("movu.b %0,$r13 \n\ | ||
60 | lslq 8,$r13 \n\ | ||
61 | move.b %0,$r13 \n\ | ||
62 | move.d $r13,%0 \n\ | ||
63 | lslq 16,$r13 \n\ | ||
64 | or.d $r13,%0" | ||
65 | : "=r" (lc) : "0" (lc) : "r13"); | ||
66 | |||
67 | { | ||
68 | register char *dst __asm__ ("r13") = pdst; | ||
69 | |||
70 | /* This is NONPORTABLE, but since this whole routine is */ | ||
71 | /* grossly nonportable that doesn't matter. Align dst to a 4-byte boundary first: this writes at most one byte and one short, hence the n >= 3 guard. */ | ||
72 | |||
73 | if (((unsigned long) pdst & 3) != 0 | ||
74 | /* Oops! n=0 must be a legal call, regardless of alignment. */ | ||
75 | && n >= 3) | ||
76 | { | ||
77 | if ((unsigned long)dst & 1) | ||
78 | { | ||
79 | *dst = (char) lc; | ||
80 | n--; | ||
81 | dst++; | ||
82 | } | ||
83 | |||
84 | if ((unsigned long)dst & 2) | ||
85 | { | ||
86 | *(short *)dst = lc; | ||
87 | n -= 2; | ||
88 | dst += 2; | ||
89 | } | ||
90 | } | ||
91 | |||
92 | /* Now the fun part. For the threshold value of this, check the equation | ||
93 | above. */ | ||
94 | /* Decide which fill method to use. */ | ||
95 | if (n >= ZERO_BLOCK_SIZE) | ||
96 | { | ||
97 | /* For large fills we use 'movem' */ | ||
98 | |||
99 | /* It is not optimal to tell the compiler about clobbering any | ||
100 | registers; that will move the saving/restoring of those registers | ||
101 | to the function prologue/epilogue, and make non-movem sizes | ||
102 | suboptimal. | ||
103 | |||
104 | This method is not foolproof; it assumes that the "asm reg" | ||
105 | declarations at the beginning of the function really are used | ||
106 | here (beware: they may be moved to temporary registers). | ||
107 | This way, we do not have to save/move the registers around into | ||
108 | temporaries; we can safely use them straight away. | ||
109 | |||
110 | The allocation is checked at assembly time: the .ifnc directive at | ||
111 | the start of the asm compares operands %0-%1-%4 with | ||
112 | "$r13-$r12-$r11" and hits .err if they differ. */ | ||
113 | __asm__ volatile (" \n\ | ||
114 | ;; Check that the register asm declaration got right. \n\ | ||
115 | ;; The GCC manual says it will work, but there *has* been bugs. \n\ | ||
116 | .ifnc %0-%1-%4,$r13-$r12-$r11 \n\ | ||
117 | .err \n\ | ||
118 | .endif \n\ | ||
119 | \n\ | ||
120 | ;; Save the registers we'll clobber in the movem process \n\ | ||
121 | ;; on the stack. Don't mention them to gcc, it will only be \n\ | ||
122 | ;; upset. \n\ | ||
123 | subq 11*4,$sp \n\ | ||
124 | movem $r10,[$sp] \n\ | ||
125 | \n\ | ||
126 | move.d $r11,$r0 \n\ | ||
127 | move.d $r11,$r1 \n\ | ||
128 | move.d $r11,$r2 \n\ | ||
129 | move.d $r11,$r3 \n\ | ||
130 | move.d $r11,$r4 \n\ | ||
131 | move.d $r11,$r5 \n\ | ||
132 | move.d $r11,$r6 \n\ | ||
133 | move.d $r11,$r7 \n\ | ||
134 | move.d $r11,$r8 \n\ | ||
135 | move.d $r11,$r9 \n\ | ||
136 | move.d $r11,$r10 \n\ | ||
137 | \n\ | ||
138 | ;; Now we've got this: \n\ | ||
139 | ;; r13 - dst \n\ | ||
140 | ;; r12 - n \n\ | ||
141 | \n\ | ||
142 | ;; Update n for the first loop \n\ | ||
143 | subq 12*4,$r12 \n\ | ||
144 | 0: \n\ | ||
145 | subq 12*4,$r12 \n\ | ||
146 | bge 0b \n\ | ||
147 | movem $r11,[$r13+] \n\ | ||
148 | \n\ | ||
149 | addq 12*4,$r12 ;; compensate for last loop underflowing n \n\ | ||
150 | \n\ | ||
151 | ;; Restore registers from stack \n\ | ||
152 | movem [$sp+],$r10" | ||
153 | |||
154 | /* Outputs */ : "=r" (dst), "=r" (n) | ||
155 | /* Inputs */ : "0" (dst), "1" (n), "r" (lc)); | ||
156 | } | ||
157 | |||
158 | /* Either we start filling directly, one dword at a time | ||
159 | in a loop, or we have filled as much as possible with 'movem' | ||
160 | and the remainder (< 48 bytes) is filled here. | ||
161 | This will work since the 'movem' loop will have updated dst and n. NOTE(review): the "*((long*)dst)++" form used below is a cast-as-lvalue, a GCC extension that newer GCC releases reportedly no longer accept - confirm against the target compiler. */ | ||
162 | |||
163 | while ( n >= 16 ) | ||
164 | { | ||
165 | *((long*)dst)++ = lc; | ||
166 | *((long*)dst)++ = lc; | ||
167 | *((long*)dst)++ = lc; | ||
168 | *((long*)dst)++ = lc; | ||
169 | n -= 16; | ||
170 | } | ||
171 | |||
172 | /* A switch() is definitely the fastest although it takes a LOT of code. | ||
173 | * Particularly if you inline code this. n is below 16 here; each case stores the remaining whole dwords, then a short and/or a byte as needed. | ||
174 | */ | ||
175 | switch (n) | ||
176 | { | ||
177 | case 0: | ||
178 | break; | ||
179 | case 1: | ||
180 | *(char*)dst = (char) lc; | ||
181 | break; | ||
182 | case 2: | ||
183 | *(short*)dst = (short) lc; | ||
184 | break; | ||
185 | case 3: | ||
186 | *((short*)dst)++ = (short) lc; | ||
187 | *(char*)dst = (char) lc; | ||
188 | break; | ||
189 | case 4: | ||
190 | *((long*)dst)++ = lc; | ||
191 | break; | ||
192 | case 5: | ||
193 | *((long*)dst)++ = lc; | ||
194 | *(char*)dst = (char) lc; | ||
195 | break; | ||
196 | case 6: | ||
197 | *((long*)dst)++ = lc; | ||
198 | *(short*)dst = (short) lc; | ||
199 | break; | ||
200 | case 7: | ||
201 | *((long*)dst)++ = lc; | ||
202 | *((short*)dst)++ = (short) lc; | ||
203 | *(char*)dst = (char) lc; | ||
204 | break; | ||
205 | case 8: | ||
206 | *((long*)dst)++ = lc; | ||
207 | *((long*)dst)++ = lc; | ||
208 | break; | ||
209 | case 9: | ||
210 | *((long*)dst)++ = lc; | ||
211 | *((long*)dst)++ = lc; | ||
212 | *(char*)dst = (char) lc; | ||
213 | break; | ||
214 | case 10: | ||
215 | *((long*)dst)++ = lc; | ||
216 | *((long*)dst)++ = lc; | ||
217 | *(short*)dst = (short) lc; | ||
218 | break; | ||
219 | case 11: | ||
220 | *((long*)dst)++ = lc; | ||
221 | *((long*)dst)++ = lc; | ||
222 | *((short*)dst)++ = (short) lc; | ||
223 | *(char*)dst = (char) lc; | ||
224 | break; | ||
225 | case 12: | ||
226 | *((long*)dst)++ = lc; | ||
227 | *((long*)dst)++ = lc; | ||
228 | *((long*)dst)++ = lc; | ||
229 | break; | ||
230 | case 13: | ||
231 | *((long*)dst)++ = lc; | ||
232 | *((long*)dst)++ = lc; | ||
233 | *((long*)dst)++ = lc; | ||
234 | *(char*)dst = (char) lc; | ||
235 | break; | ||
236 | case 14: | ||
237 | *((long*)dst)++ = lc; | ||
238 | *((long*)dst)++ = lc; | ||
239 | *((long*)dst)++ = lc; | ||
240 | *(short*)dst = (short) lc; | ||
241 | break; | ||
242 | case 15: | ||
243 | *((long*)dst)++ = lc; | ||
244 | *((long*)dst)++ = lc; | ||
245 | *((long*)dst)++ = lc; | ||
246 | *((short*)dst)++ = (short) lc; | ||
247 | *(char*)dst = (char) lc; | ||
248 | break; | ||
249 | } | ||
250 | } | ||
251 | |||
252 | return return_dst; /* destination pointer. */ | ||
253 | } /* memset() */ | ||