Diffstat (limited to 'arch/mips/lib/memcpy.S')
-rw-r--r--  arch/mips/lib/memcpy.S | 250
1 file changed, 147 insertions(+), 103 deletions(-)
diff --git a/arch/mips/lib/memcpy.S b/arch/mips/lib/memcpy.S
index a526c62cb76a..c06cccf60bec 100644
--- a/arch/mips/lib/memcpy.S
+++ b/arch/mips/lib/memcpy.S
@@ -9,6 +9,7 @@
  * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
  * Copyright (C) 2002 Broadcom, Inc.
  * memcpy/copy_user author: Mark Vandevoorde
+ * Copyright (C) 2007 Maciej W. Rozycki
  *
  * Mnemonic names for arguments to memcpy/__copy_user
  */
@@ -175,7 +176,11 @@
 
 	.text
 	.set noreorder
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 	.set noat
+#else
+	.set at=v1
+#endif
 
 /*
 * A combined memcpy/__copy_user
@@ -186,7 +191,7 @@
 	.align 5
 LEAF(memcpy) /* a0=dst a1=src a2=len */
 	move v0, dst /* return value */
-__memcpy:
+.L__memcpy:
 FEXPORT(__copy_user)
 /*
 * Note: dst & src may be unaligned, len may be 0
@@ -194,6 +199,7 @@ FEXPORT(__copy_user)
 */
 #define rem t8
 
+	R10KCBARRIER(0(ra))
 /*
 * The "issue break"s below are very approximate.
 * Issue delays for dcache fills will perturb the schedule, as will
@@ -207,44 +213,45 @@ FEXPORT(__copy_user)
 	and t1, dst, ADDRMASK
 	PREF( 0, 1*32(src) )
 	PREF( 1, 1*32(dst) )
-	bnez t2, copy_bytes_checklen
+	bnez t2, .Lcopy_bytes_checklen
 	and t0, src, ADDRMASK
 	PREF( 0, 2*32(src) )
 	PREF( 1, 2*32(dst) )
-	bnez t1, dst_unaligned
+	bnez t1, .Ldst_unaligned
 	nop
-	bnez t0, src_unaligned_dst_aligned
+	bnez t0, .Lsrc_unaligned_dst_aligned
 /*
 * use delay slot for fall-through
 * src and dst are aligned; need to compute rem
 */
-both_aligned:
+.Lboth_aligned:
 	SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter
-	beqz t0, cleanup_both_aligned # len < 8*NBYTES
+	beqz t0, .Lcleanup_both_aligned # len < 8*NBYTES
 	and rem, len, (8*NBYTES-1) # rem = len % (8*NBYTES)
 	PREF( 0, 3*32(src) )
 	PREF( 1, 3*32(dst) )
 	.align 4
 1:
-EXC( LOAD t0, UNIT(0)(src), l_exc)
-EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
-EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
-EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
+	R10KCBARRIER(0(ra))
+EXC( LOAD t0, UNIT(0)(src), .Ll_exc)
+EXC( LOAD t1, UNIT(1)(src), .Ll_exc_copy)
+EXC( LOAD t2, UNIT(2)(src), .Ll_exc_copy)
+EXC( LOAD t3, UNIT(3)(src), .Ll_exc_copy)
 	SUB len, len, 8*NBYTES
-EXC( LOAD t4, UNIT(4)(src), l_exc_copy)
-EXC( LOAD t7, UNIT(5)(src), l_exc_copy)
-EXC( STORE t0, UNIT(0)(dst), s_exc_p8u)
-EXC( STORE t1, UNIT(1)(dst), s_exc_p7u)
-EXC( LOAD t0, UNIT(6)(src), l_exc_copy)
-EXC( LOAD t1, UNIT(7)(src), l_exc_copy)
+EXC( LOAD t4, UNIT(4)(src), .Ll_exc_copy)
+EXC( LOAD t7, UNIT(5)(src), .Ll_exc_copy)
+EXC( STORE t0, UNIT(0)(dst), .Ls_exc_p8u)
+EXC( STORE t1, UNIT(1)(dst), .Ls_exc_p7u)
+EXC( LOAD t0, UNIT(6)(src), .Ll_exc_copy)
+EXC( LOAD t1, UNIT(7)(src), .Ll_exc_copy)
 	ADD src, src, 8*NBYTES
 	ADD dst, dst, 8*NBYTES
-EXC( STORE t2, UNIT(-6)(dst), s_exc_p6u)
-EXC( STORE t3, UNIT(-5)(dst), s_exc_p5u)
-EXC( STORE t4, UNIT(-4)(dst), s_exc_p4u)
-EXC( STORE t7, UNIT(-3)(dst), s_exc_p3u)
-EXC( STORE t0, UNIT(-2)(dst), s_exc_p2u)
-EXC( STORE t1, UNIT(-1)(dst), s_exc_p1u)
+EXC( STORE t2, UNIT(-6)(dst), .Ls_exc_p6u)
+EXC( STORE t3, UNIT(-5)(dst), .Ls_exc_p5u)
+EXC( STORE t4, UNIT(-4)(dst), .Ls_exc_p4u)
+EXC( STORE t7, UNIT(-3)(dst), .Ls_exc_p3u)
+EXC( STORE t0, UNIT(-2)(dst), .Ls_exc_p2u)
+EXC( STORE t1, UNIT(-1)(dst), .Ls_exc_p1u)
 	PREF( 0, 8*32(src) )
 	PREF( 1, 8*32(dst) )
 	bne len, rem, 1b
@@ -253,39 +260,45 @@ EXC( STORE t1, UNIT(-1)(dst), s_exc_p1u)
 /*
 * len == rem == the number of bytes left to copy < 8*NBYTES
 */
-cleanup_both_aligned:
-	beqz len, done
+.Lcleanup_both_aligned:
+	beqz len, .Ldone
 	sltu t0, len, 4*NBYTES
-	bnez t0, less_than_4units
+	bnez t0, .Lless_than_4units
 	and rem, len, (NBYTES-1) # rem = len % NBYTES
 /*
 * len >= 4*NBYTES
 */
-EXC( LOAD t0, UNIT(0)(src), l_exc)
-EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
-EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
-EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
+EXC( LOAD t0, UNIT(0)(src), .Ll_exc)
+EXC( LOAD t1, UNIT(1)(src), .Ll_exc_copy)
+EXC( LOAD t2, UNIT(2)(src), .Ll_exc_copy)
+EXC( LOAD t3, UNIT(3)(src), .Ll_exc_copy)
 	SUB len, len, 4*NBYTES
 	ADD src, src, 4*NBYTES
-EXC( STORE t0, UNIT(0)(dst), s_exc_p4u)
-EXC( STORE t1, UNIT(1)(dst), s_exc_p3u)
-EXC( STORE t2, UNIT(2)(dst), s_exc_p2u)
-EXC( STORE t3, UNIT(3)(dst), s_exc_p1u)
-	beqz len, done
-	ADD dst, dst, 4*NBYTES
-less_than_4units:
+	R10KCBARRIER(0(ra))
+EXC( STORE t0, UNIT(0)(dst), .Ls_exc_p4u)
+EXC( STORE t1, UNIT(1)(dst), .Ls_exc_p3u)
+EXC( STORE t2, UNIT(2)(dst), .Ls_exc_p2u)
+EXC( STORE t3, UNIT(3)(dst), .Ls_exc_p1u)
+	.set reorder /* DADDI_WAR */
+	ADD dst, dst, 4*NBYTES
+	beqz len, .Ldone
+	.set noreorder
+.Lless_than_4units:
 /*
 * rem = len % NBYTES
 */
-	beq rem, len, copy_bytes
+	beq rem, len, .Lcopy_bytes
 	nop
 1:
-EXC( LOAD t0, 0(src), l_exc)
+	R10KCBARRIER(0(ra))
+EXC( LOAD t0, 0(src), .Ll_exc)
 	ADD src, src, NBYTES
 	SUB len, len, NBYTES
-EXC( STORE t0, 0(dst), s_exc_p1u)
+EXC( STORE t0, 0(dst), .Ls_exc_p1u)
+	.set reorder /* DADDI_WAR */
+	ADD dst, dst, NBYTES
 	bne rem, len, 1b
-	ADD dst, dst, NBYTES
+	.set noreorder
 
 /*
 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
@@ -299,17 +312,17 @@ EXC( STORE t0, 0(dst), s_exc_p1u)
 * more instruction-level parallelism.
 */
 #define bits t2
-	beqz len, done
+	beqz len, .Ldone
 	ADD t1, dst, len # t1 is just past last byte of dst
 	li bits, 8*NBYTES
 	SLL rem, len, 3 # rem = number of bits to keep
-EXC( LOAD t0, 0(src), l_exc)
+EXC( LOAD t0, 0(src), .Ll_exc)
 	SUB bits, bits, rem # bits = number of bits to discard
 	SHIFT_DISCARD t0, t0, bits
-EXC( STREST t0, -1(t1), s_exc)
+EXC( STREST t0, -1(t1), .Ls_exc)
 	jr ra
 	move len, zero
-dst_unaligned:
+.Ldst_unaligned:
 /*
 * dst is unaligned
 * t0 = src & ADDRMASK
@@ -320,22 +333,23 @@ dst_unaligned:
 * Set match = (src and dst have same alignment)
 */
 #define match rem
-EXC( LDFIRST t3, FIRST(0)(src), l_exc)
+EXC( LDFIRST t3, FIRST(0)(src), .Ll_exc)
 	ADD t2, zero, NBYTES
-EXC( LDREST t3, REST(0)(src), l_exc_copy)
+EXC( LDREST t3, REST(0)(src), .Ll_exc_copy)
 	SUB t2, t2, t1 # t2 = number of bytes copied
 	xor match, t0, t1
-EXC( STFIRST t3, FIRST(0)(dst), s_exc)
-	beq len, t2, done
+	R10KCBARRIER(0(ra))
+EXC( STFIRST t3, FIRST(0)(dst), .Ls_exc)
+	beq len, t2, .Ldone
 	SUB len, len, t2
 	ADD dst, dst, t2
-	beqz match, both_aligned
+	beqz match, .Lboth_aligned
 	ADD src, src, t2
 
-src_unaligned_dst_aligned:
+.Lsrc_unaligned_dst_aligned:
 	SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter
 	PREF( 0, 3*32(src) )
-	beqz t0, cleanup_src_unaligned
+	beqz t0, .Lcleanup_src_unaligned
 	and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES
 	PREF( 1, 3*32(dst) )
 1:
@@ -345,52 +359,59 @@ src_unaligned_dst_aligned:
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
-EXC( LDFIRST t0, FIRST(0)(src), l_exc)
-EXC( LDFIRST t1, FIRST(1)(src), l_exc_copy)
+	R10KCBARRIER(0(ra))
+EXC( LDFIRST t0, FIRST(0)(src), .Ll_exc)
+EXC( LDFIRST t1, FIRST(1)(src), .Ll_exc_copy)
 	SUB len, len, 4*NBYTES
-EXC( LDREST t0, REST(0)(src), l_exc_copy)
-EXC( LDREST t1, REST(1)(src), l_exc_copy)
-EXC( LDFIRST t2, FIRST(2)(src), l_exc_copy)
-EXC( LDFIRST t3, FIRST(3)(src), l_exc_copy)
-EXC( LDREST t2, REST(2)(src), l_exc_copy)
-EXC( LDREST t3, REST(3)(src), l_exc_copy)
+EXC( LDREST t0, REST(0)(src), .Ll_exc_copy)
+EXC( LDREST t1, REST(1)(src), .Ll_exc_copy)
+EXC( LDFIRST t2, FIRST(2)(src), .Ll_exc_copy)
+EXC( LDFIRST t3, FIRST(3)(src), .Ll_exc_copy)
+EXC( LDREST t2, REST(2)(src), .Ll_exc_copy)
+EXC( LDREST t3, REST(3)(src), .Ll_exc_copy)
 	PREF( 0, 9*32(src) ) # 0 is PREF_LOAD (not streamed)
 	ADD src, src, 4*NBYTES
 #ifdef CONFIG_CPU_SB1
 	nop # improves slotting
 #endif
-EXC( STORE t0, UNIT(0)(dst), s_exc_p4u)
-EXC( STORE t1, UNIT(1)(dst), s_exc_p3u)
-EXC( STORE t2, UNIT(2)(dst), s_exc_p2u)
-EXC( STORE t3, UNIT(3)(dst), s_exc_p1u)
+EXC( STORE t0, UNIT(0)(dst), .Ls_exc_p4u)
+EXC( STORE t1, UNIT(1)(dst), .Ls_exc_p3u)
+EXC( STORE t2, UNIT(2)(dst), .Ls_exc_p2u)
+EXC( STORE t3, UNIT(3)(dst), .Ls_exc_p1u)
 	PREF( 1, 9*32(dst) ) # 1 is PREF_STORE (not streamed)
+	.set reorder /* DADDI_WAR */
+	ADD dst, dst, 4*NBYTES
 	bne len, rem, 1b
-	ADD dst, dst, 4*NBYTES
+	.set noreorder
 
-cleanup_src_unaligned:
-	beqz len, done
+.Lcleanup_src_unaligned:
+	beqz len, .Ldone
 	and rem, len, NBYTES-1 # rem = len % NBYTES
-	beq rem, len, copy_bytes
+	beq rem, len, .Lcopy_bytes
 	nop
 1:
-EXC( LDFIRST t0, FIRST(0)(src), l_exc)
-EXC( LDREST t0, REST(0)(src), l_exc_copy)
+	R10KCBARRIER(0(ra))
+EXC( LDFIRST t0, FIRST(0)(src), .Ll_exc)
+EXC( LDREST t0, REST(0)(src), .Ll_exc_copy)
 	ADD src, src, NBYTES
 	SUB len, len, NBYTES
-EXC( STORE t0, 0(dst), s_exc_p1u)
+EXC( STORE t0, 0(dst), .Ls_exc_p1u)
+	.set reorder /* DADDI_WAR */
+	ADD dst, dst, NBYTES
 	bne len, rem, 1b
-	ADD dst, dst, NBYTES
+	.set noreorder
 
-copy_bytes_checklen:
-	beqz len, done
+.Lcopy_bytes_checklen:
+	beqz len, .Ldone
 	nop
-copy_bytes:
+.Lcopy_bytes:
 /* 0 < len < NBYTES */
+	R10KCBARRIER(0(ra))
 #define COPY_BYTE(N) \
-EXC( lb t0, N(src), l_exc); \
+EXC( lb t0, N(src), .Ll_exc); \
 	SUB len, len, 1; \
-	beqz len, done; \
-EXC( sb t0, N(dst), s_exc_p1)
+	beqz len, .Ldone; \
+EXC( sb t0, N(dst), .Ls_exc_p1)
 
 	COPY_BYTE(0)
 	COPY_BYTE(1)
@@ -400,16 +421,16 @@ EXC( sb t0, N(dst), s_exc_p1)
 	COPY_BYTE(4)
 	COPY_BYTE(5)
 #endif
-EXC( lb t0, NBYTES-2(src), l_exc)
+EXC( lb t0, NBYTES-2(src), .Ll_exc)
 	SUB len, len, 1
 	jr ra
-EXC( sb t0, NBYTES-2(dst), s_exc_p1)
-done:
+EXC( sb t0, NBYTES-2(dst), .Ls_exc_p1)
+.Ldone:
 	jr ra
 	nop
 	END(memcpy)
 
-l_exc_copy:
+.Ll_exc_copy:
 /*
 * Copy bytes from src until faulting load address (or until a
 * lb faults)
@@ -424,12 +445,14 @@ l_exc_copy:
 	nop
 	LOAD t0, THREAD_BUADDR(t0)
 1:
-EXC( lb t1, 0(src), l_exc)
+EXC( lb t1, 0(src), .Ll_exc)
 	ADD src, src, 1
 	sb t1, 0(dst) # can't fault -- we're copy_from_user
+	.set reorder /* DADDI_WAR */
+	ADD dst, dst, 1
 	bne src, t0, 1b
-	ADD dst, dst, 1
-l_exc:
+	.set noreorder
+.Ll_exc:
 	LOAD t0, TI_TASK($28)
 	nop
 	LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address
@@ -446,20 +469,33 @@ l_exc:
 * Clear len bytes starting at dst. Can't call __bzero because it
 * might modify len. An inefficient loop for these rare times...
 */
-	beqz len, done
-	SUB src, len, 1
+	.set reorder /* DADDI_WAR */
+	SUB src, len, 1
+	beqz len, .Ldone
+	.set noreorder
 1: sb zero, 0(dst)
 	ADD dst, dst, 1
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 	bnez src, 1b
 	SUB src, src, 1
+#else
+	.set push
+	.set noat
+	li v1, 1
+	bnez src, 1b
+	SUB src, src, v1
+	.set pop
+#endif
 	jr ra
 	nop
 
 
 #define SEXC(n) \
-s_exc_p ## n ## u: \
-	jr ra; \
-	ADD len, len, n*NBYTES
+	.set reorder; /* DADDI_WAR */ \
+.Ls_exc_p ## n ## u: \
+	ADD len, len, n*NBYTES; \
+	jr ra; \
+	.set noreorder
 
 SEXC(8)
 SEXC(7)
@@ -470,10 +506,12 @@ SEXC(3)
 SEXC(2)
 SEXC(1)
 
-s_exc_p1:
+.Ls_exc_p1:
+	.set reorder /* DADDI_WAR */
+	ADD len, len, 1
 	jr ra
-	ADD len, len, 1
-s_exc:
+	.set noreorder
+.Ls_exc:
 	jr ra
 	nop
 
@@ -484,38 +522,44 @@ LEAF(memmove)
 	sltu t0, a1, t0 # dst + len <= src -> memcpy
 	sltu t1, a0, t1 # dst >= src + len -> memcpy
 	and t0, t1
-	beqz t0, __memcpy
+	beqz t0, .L__memcpy
 	move v0, a0 /* return value */
-	beqz a2, r_out
+	beqz a2, .Lr_out
 	END(memmove)
 
 /* fall through to __rmemcpy */
 LEAF(__rmemcpy) /* a0=dst a1=src a2=len */
 	sltu t0, a1, a0
-	beqz t0, r_end_bytes_up # src >= dst
+	beqz t0, .Lr_end_bytes_up # src >= dst
 	nop
 	ADD a0, a2 # dst = dst + len
 	ADD a1, a2 # src = src + len
 
-r_end_bytes:
+.Lr_end_bytes:
+	R10KCBARRIER(0(ra))
 	lb t0, -1(a1)
 	SUB a2, a2, 0x1
 	sb t0, -1(a0)
 	SUB a1, a1, 0x1
-	bnez a2, r_end_bytes
-	SUB a0, a0, 0x1
+	.set reorder /* DADDI_WAR */
+	SUB a0, a0, 0x1
+	bnez a2, .Lr_end_bytes
+	.set noreorder
 
-r_out:
+.Lr_out:
 	jr ra
 	move a2, zero
 
-r_end_bytes_up:
+.Lr_end_bytes_up:
+	R10KCBARRIER(0(ra))
 	lb t0, (a1)
 	SUB a2, a2, 0x1
 	sb t0, (a0)
 	ADD a1, a1, 0x1
-	bnez a2, r_end_bytes_up
-	ADD a0, a0, 0x1
+	.set reorder /* DADDI_WAR */
+	ADD a0, a0, 0x1
+	bnez a2, .Lr_end_bytes_up
+	.set noreorder
 
 	jr ra
 	move a2, zero