Diffstat (limited to 'arch/mips/lib/memcpy-inatomic.S')
 -rw-r--r--	arch/mips/lib/memcpy-inatomic.S | 141
 1 file changed, 78 insertions(+), 63 deletions(-)
diff --git a/arch/mips/lib/memcpy-inatomic.S b/arch/mips/lib/memcpy-inatomic.S
index 3a534b2baa0f..736d0fb56a94 100644
--- a/arch/mips/lib/memcpy-inatomic.S
+++ b/arch/mips/lib/memcpy-inatomic.S
@@ -9,6 +9,7 @@
  * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
  * Copyright (C) 2002 Broadcom, Inc.
  *   memcpy/copy_user author: Mark Vandevoorde
+ * Copyright (C) 2007 Maciej W. Rozycki
  *
  * Mnemonic names for arguments to memcpy/__copy_user
  */
@@ -175,7 +176,11 @@
 
 	.text
 	.set	noreorder
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 	.set	noat
+#else
+	.set	at=v1
+#endif
 
 /*
  * A combined memcpy/__copy_user
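
Note on the `.set at=v1` alternative above, as a hedged sketch rather than a statement of exactly what gas emits: with CONFIG_CPU_DADDI_WORKAROUNDS the kernel is built to avoid the R4000/R4400 DADDI erratum, so a 64-bit add with an immediate operand may be expanded by the assembler into a multi-instruction sequence that needs the assembler temporary, roughly:

	# ADD	dst, dst, NBYTES	may become something like:
	li	v1, NBYTES		# v1 serves as AT here, per ".set at=v1"
	daddu	dst, dst, v1

`.set noat` would forbid any such temporary, so it is kept only for builds without the workaround; v1 appears to be free in this routine, hence `.set at=v1`.
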
@@ -204,36 +209,36 @@ LEAF(__copy_user_inatomic)
 	and	t1, dst, ADDRMASK
 	PREF(	0, 1*32(src) )
 	PREF(	1, 1*32(dst) )
-	bnez	t2, copy_bytes_checklen
+	bnez	t2, .Lcopy_bytes_checklen
 	and	t0, src, ADDRMASK
 	PREF(	0, 2*32(src) )
 	PREF(	1, 2*32(dst) )
-	bnez	t1, dst_unaligned
+	bnez	t1, .Ldst_unaligned
 	nop
-	bnez	t0, src_unaligned_dst_aligned
+	bnez	t0, .Lsrc_unaligned_dst_aligned
 	/*
 	 * use delay slot for fall-through
 	 * src and dst are aligned; need to compute rem
 	 */
-both_aligned:
+.Lboth_aligned:
 	SRL	t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
-	beqz	t0, cleanup_both_aligned # len < 8*NBYTES
+	beqz	t0, .Lcleanup_both_aligned # len < 8*NBYTES
 	and	rem, len, (8*NBYTES-1)	 # rem = len % (8*NBYTES)
 	PREF(	0, 3*32(src) )
 	PREF(	1, 3*32(dst) )
 	.align	4
 1:
-EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
-EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
-EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
-EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
+EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
+EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
+EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
+EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
 	SUB	len, len, 8*NBYTES
-EXC(	LOAD	t4, UNIT(4)(src),	l_exc_copy)
-EXC(	LOAD	t7, UNIT(5)(src),	l_exc_copy)
+EXC(	LOAD	t4, UNIT(4)(src),	.Ll_exc_copy)
+EXC(	LOAD	t7, UNIT(5)(src),	.Ll_exc_copy)
 	STORE	t0, UNIT(0)(dst)
 	STORE	t1, UNIT(1)(dst)
-EXC(	LOAD	t0, UNIT(6)(src),	l_exc_copy)
-EXC(	LOAD	t1, UNIT(7)(src),	l_exc_copy)
+EXC(	LOAD	t0, UNIT(6)(src),	.Ll_exc_copy)
+EXC(	LOAD	t1, UNIT(7)(src),	.Ll_exc_copy)
 	ADD	src, src, 8*NBYTES
 	ADD	dst, dst, 8*NBYTES
 	STORE	t2, UNIT(-6)(dst)
@@ -250,39 +255,43 @@ EXC(	LOAD	t1, UNIT(7)(src),	l_exc_copy)
 	/*
 	 * len == rem == the number of bytes left to copy < 8*NBYTES
 	 */
-cleanup_both_aligned:
-	beqz	len, done
+.Lcleanup_both_aligned:
+	beqz	len, .Ldone
 	sltu	t0, len, 4*NBYTES
-	bnez	t0, less_than_4units
+	bnez	t0, .Lless_than_4units
 	and	rem, len, (NBYTES-1)	# rem = len % NBYTES
 	/*
 	 * len >= 4*NBYTES
 	 */
-EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
-EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
-EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
-EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
+EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
+EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
+EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
+EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
 	SUB	len, len, 4*NBYTES
 	ADD	src, src, 4*NBYTES
 	STORE	t0, UNIT(0)(dst)
 	STORE	t1, UNIT(1)(dst)
 	STORE	t2, UNIT(2)(dst)
 	STORE	t3, UNIT(3)(dst)
-	beqz	len, done
+	.set	reorder		/* DADDI_WAR */
 	ADD	dst, dst, 4*NBYTES
-less_than_4units:
+	beqz	len, .Ldone
+	.set	noreorder
+.Lless_than_4units:
 	/*
 	 * rem = len % NBYTES
 	 */
-	beq	rem, len, copy_bytes
+	beq	rem, len, .Lcopy_bytes
 	nop
 1:
-EXC(	LOAD	t0, 0(src),	l_exc)
+EXC(	LOAD	t0, 0(src),	.Ll_exc)
 	ADD	src, src, NBYTES
 	SUB	len, len, NBYTES
 	STORE	t0, 0(dst)
+	.set	reorder		/* DADDI_WAR */
+	ADD	dst, dst, NBYTES
 	bne	rem, len, 1b
-	ADD	dst, dst, NBYTES
+	.set	noreorder
 
 	/*
 	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
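
Note: the `.set reorder` / `.set noreorder` bracketing introduced in this hunk (and repeated below) is one recurring transformation. An ADD that used to sit in a branch delay slot can, with the workaround enabled, expand to more than one instruction, which is illegal in a delay slot. The ADD is therefore hoisted ahead of the branch, and gas is briefly returned to reorder mode so it schedules the delay slot itself. A minimal sketch of the before/after shape:

	# before (noreorder): single-instruction ADD in the delay slot
	bne	rem, len, 1b
	 ADD	dst, dst, NBYTES

	# after (DADDI_WAR): gas fills the delay slot, with a nop if needed
	.set	reorder
	ADD	dst, dst, NBYTES
	bne	rem, len, 1b
	.set	noreorder
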
@@ -296,17 +305,17 @@ EXC(	LOAD	t0, 0(src),	l_exc)
 	 * more instruction-level parallelism.
 	 */
 #define bits t2
-	beqz	len, done
+	beqz	len, .Ldone
 	ADD	t1, dst, len	# t1 is just past last byte of dst
 	li	bits, 8*NBYTES
 	SLL	rem, len, 3	# rem = number of bits to keep
-EXC(	LOAD	t0, 0(src),	l_exc)
+EXC(	LOAD	t0, 0(src),	.Ll_exc)
 	SUB	bits, bits, rem	# bits = number of bits to discard
 	SHIFT_DISCARD t0, t0, bits
 	STREST	t0, -1(t1)
 	jr	ra
 	move	len, zero
-dst_unaligned:
+.Ldst_unaligned:
 	/*
 	 * dst is unaligned
 	 * t0 = src & ADDRMASK
@@ -317,22 +326,22 @@ dst_unaligned:
 	 * Set match = (src and dst have same alignment)
 	 */
 #define match rem
-EXC(	LDFIRST	t3, FIRST(0)(src),	l_exc)
+EXC(	LDFIRST	t3, FIRST(0)(src),	.Ll_exc)
 	ADD	t2, zero, NBYTES
-EXC(	LDREST	t3, REST(0)(src),	l_exc_copy)
+EXC(	LDREST	t3, REST(0)(src),	.Ll_exc_copy)
 	SUB	t2, t2, t1	# t2 = number of bytes copied
 	xor	match, t0, t1
 	STFIRST t3, FIRST(0)(dst)
-	beq	len, t2, done
+	beq	len, t2, .Ldone
 	SUB	len, len, t2
 	ADD	dst, dst, t2
-	beqz	match, both_aligned
+	beqz	match, .Lboth_aligned
 	ADD	src, src, t2
 
-src_unaligned_dst_aligned:
+.Lsrc_unaligned_dst_aligned:
 	SRL	t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
 	PREF(	0, 3*32(src) )
-	beqz	t0, cleanup_src_unaligned
+	beqz	t0, .Lcleanup_src_unaligned
 	and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
 	PREF(	1, 3*32(dst) )
 1:
@@ -342,15 +351,15 @@ src_unaligned_dst_aligned:
 	 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 	 * are to the same unit (unless src is aligned, but it's not).
 	 */
-EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
-EXC(	LDFIRST	t1, FIRST(1)(src),	l_exc_copy)
+EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
+EXC(	LDFIRST	t1, FIRST(1)(src),	.Ll_exc_copy)
 	SUB	len, len, 4*NBYTES
-EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
-EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
-EXC(	LDFIRST	t2, FIRST(2)(src),	l_exc_copy)
-EXC(	LDFIRST	t3, FIRST(3)(src),	l_exc_copy)
-EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
-EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
+EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
+EXC(	LDREST	t1, REST(1)(src),	.Ll_exc_copy)
+EXC(	LDFIRST	t2, FIRST(2)(src),	.Ll_exc_copy)
+EXC(	LDFIRST	t3, FIRST(3)(src),	.Ll_exc_copy)
+EXC(	LDREST	t2, REST(2)(src),	.Ll_exc_copy)
+EXC(	LDREST	t3, REST(3)(src),	.Ll_exc_copy)
 	PREF(	0, 9*32(src) )	# 0 is PREF_LOAD  (not streamed)
 	ADD	src, src, 4*NBYTES
 #ifdef CONFIG_CPU_SB1
@@ -361,32 +370,36 @@ EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
 	STORE	t2, UNIT(2)(dst)
 	STORE	t3, UNIT(3)(dst)
 	PREF(	1, 9*32(dst) )	# 1 is PREF_STORE (not streamed)
+	.set	reorder		/* DADDI_WAR */
+	ADD	dst, dst, 4*NBYTES
 	bne	len, rem, 1b
-	ADD	dst, dst, 4*NBYTES
+	.set	noreorder
 
-cleanup_src_unaligned:
-	beqz	len, done
+.Lcleanup_src_unaligned:
+	beqz	len, .Ldone
 	and	rem, len, NBYTES-1  # rem = len % NBYTES
-	beq	rem, len, copy_bytes
+	beq	rem, len, .Lcopy_bytes
 	nop
 1:
-EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
-EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
+EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
+EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
 	ADD	src, src, NBYTES
 	SUB	len, len, NBYTES
 	STORE	t0, 0(dst)
+	.set	reorder		/* DADDI_WAR */
+	ADD	dst, dst, NBYTES
 	bne	len, rem, 1b
-	ADD	dst, dst, NBYTES
+	.set	noreorder
 
-copy_bytes_checklen:
-	beqz	len, done
+.Lcopy_bytes_checklen:
+	beqz	len, .Ldone
 	nop
-copy_bytes:
+.Lcopy_bytes:
 	/* 0 < len < NBYTES */
 #define COPY_BYTE(N)			\
-EXC(	lb	t0, N(src), l_exc);	\
+EXC(	lb	t0, N(src), .Ll_exc);	\
 	SUB	len, len, 1;		\
-	beqz	len, done;		\
+	beqz	len, .Ldone;		\
 	sb	t0, N(dst)
 
 	COPY_BYTE(0)
@@ -397,16 +410,16 @@ EXC(	lb	t0, N(src), l_exc);	\
 	COPY_BYTE(4)
 	COPY_BYTE(5)
 #endif
-EXC(	lb	t0, NBYTES-2(src), l_exc)
+EXC(	lb	t0, NBYTES-2(src), .Ll_exc)
 	SUB	len, len, 1
 	jr	ra
 	sb	t0, NBYTES-2(dst)
-done:
+.Ldone:
 	jr	ra
 	nop
 	END(__copy_user_inatomic)
 
-l_exc_copy:
+.Ll_exc_copy:
 	/*
 	 * Copy bytes from src until faulting load address (or until a
 	 * lb faults)
@@ -421,12 +434,14 @@ l_exc_copy:
 	nop
 	LOAD	t0, THREAD_BUADDR(t0)
 1:
-EXC(	lb	t1, 0(src),	l_exc)
+EXC(	lb	t1, 0(src),	.Ll_exc)
 	ADD	src, src, 1
 	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
+	.set	reorder		/* DADDI_WAR */
+	ADD	dst, dst, 1
 	bne	src, t0, 1b
-	ADD	dst, dst, 1
-l_exc:
+	.set	noreorder
+.Ll_exc:
 	LOAD	t0, TI_TASK($28)
 	nop
 	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
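
Note: independent of the DADDI workaround, the patch also renames every internal label to an `.L`-prefixed form (`both_aligned` to `.Lboth_aligned`, `l_exc` to `.Ll_exc`, and so on). In gas, a `.L` prefix marks a label as local to the object file, for example:

	done:		# global by default: lands in the symbol table
	.Ldone:		# local: not emitted, invisible to the linker

so faults and profile hits inside the routine are attributed to `__copy_user_inatomic` itself rather than to whichever internal label happens to precede the address.
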